1#if defined(__x86_64__)
2
3#include "llvm_blake3_prefix.h"
4
/*
 * On ELF targets, except when both __sun__ and __svr4__ are defined, emit an
 * empty .note.GNU-stack section. By GNU toolchain convention an empty note of
 * this name tells the linker that this object does not need an executable
 * stack.
 */
5#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
6.section .note.GNU-stack,"",%progbits
7#endif
8
/*
 * Pull in <cet.h> only when building for ELF with __CET__ defined, and only
 * if the preprocessor supports __has_include and the header is actually
 * present.
 */
9#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
10#if __has_include(<cet.h>)
11#include <cet.h>
12#endif
13#endif
14
/*
 * Fallback: if nothing above defined _CET_ENDBR, define it as empty so the
 * `_CET_ENDBR` line at each function entry expands to nothing.
 */
15#if !defined(_CET_ENDBR)
16#define _CET_ENDBR
17#endif
18
/*
 * HIDDEN expands to the assembler directive used below to restrict symbol
 * visibility: `.private_extern` when __APPLE__ is defined, `.hidden`
 * otherwise.
 */
19#ifdef __APPLE__
20#define HIDDEN .private_extern
21#else
22#define HIDDEN .hidden
23#endif
24
/* Everything below uses Intel operand order with unprefixed register names. */
25.intel_syntax noprefix
/*
 * Each entry point is declared under two names, with and without a leading
 * underscore. Both spellings are given HIDDEN visibility here ...
 */
26HIDDEN blake3_hash_many_sse41
27HIDDEN _blake3_hash_many_sse41
28HIDDEN blake3_compress_in_place_sse41
29HIDDEN _blake3_compress_in_place_sse41
30HIDDEN blake3_compress_xof_sse41
31HIDDEN _blake3_compress_xof_sse41
/* ... and are also declared as global symbols. */
32.global blake3_hash_many_sse41
33.global _blake3_hash_many_sse41
34.global blake3_compress_in_place_sse41
35.global _blake3_compress_in_place_sse41
36.global blake3_compress_xof_sse41
37.global _blake3_compress_xof_sse41
/*
 * Switch to the code section: the bare `.text` directive on Apple targets,
 * `.section .text` elsewhere.
 */
38#ifdef __APPLE__
39.text
40#else
41.section .text
42#endif
/* Align what follows to a 2^6 = 64-byte boundary. */
43 .p2align 6
44_blake3_hash_many_sse41:
45blake3_hash_many_sse41:
46 _CET_ENDBR
47 push r15
48 push r14
49 push r13
50 push r12
51 push rbx
52 push rbp
53 mov rbp, rsp
54 sub rsp, 360
55 and rsp, 0xFFFFFFFFFFFFFFC0
56 neg r9d
57 movd xmm0, r9d
58 pshufd xmm0, xmm0, 0x00
59 movdqa xmmword ptr [rsp+0x130], xmm0
60 movdqa xmm1, xmm0
61 pand xmm1, xmmword ptr [ADD0+rip]
62 pand xmm0, xmmword ptr [ADD1+rip]
63 movdqa xmmword ptr [rsp+0x150], xmm0
64 movd xmm0, r8d
65 pshufd xmm0, xmm0, 0x00
66 paddd xmm0, xmm1
67 movdqa xmmword ptr [rsp+0x110], xmm0
68 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
69 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
70 pcmpgtd xmm1, xmm0
71 shr r8, 32
72 movd xmm2, r8d
73 pshufd xmm2, xmm2, 0x00
74 psubd xmm2, xmm1
75 movdqa xmmword ptr [rsp+0x120], xmm2
76 mov rbx, qword ptr [rbp+0x50]
77 mov r15, rdx
78 shl r15, 6
79 movzx r13d, byte ptr [rbp+0x38]
80 movzx r12d, byte ptr [rbp+0x48]
81 cmp rsi, 4
82 jc 3f
832:
84 movdqu xmm3, xmmword ptr [rcx]
85 pshufd xmm0, xmm3, 0x00
86 pshufd xmm1, xmm3, 0x55
87 pshufd xmm2, xmm3, 0xAA
88 pshufd xmm3, xmm3, 0xFF
89 movdqu xmm7, xmmword ptr [rcx+0x10]
90 pshufd xmm4, xmm7, 0x00
91 pshufd xmm5, xmm7, 0x55
92 pshufd xmm6, xmm7, 0xAA
93 pshufd xmm7, xmm7, 0xFF
94 mov r8, qword ptr [rdi]
95 mov r9, qword ptr [rdi+0x8]
96 mov r10, qword ptr [rdi+0x10]
97 mov r11, qword ptr [rdi+0x18]
98 movzx eax, byte ptr [rbp+0x40]
99 or eax, r13d
100 xor edx, edx
1019:
102 mov r14d, eax
103 or eax, r12d
104 add rdx, 64
105 cmp rdx, r15
106 cmovne eax, r14d
107 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
108 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
109 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
110 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
111 movdqa xmm12, xmm8
112 punpckldq xmm8, xmm9
113 punpckhdq xmm12, xmm9
114 movdqa xmm14, xmm10
115 punpckldq xmm10, xmm11
116 punpckhdq xmm14, xmm11
117 movdqa xmm9, xmm8
118 punpcklqdq xmm8, xmm10
119 punpckhqdq xmm9, xmm10
120 movdqa xmm13, xmm12
121 punpcklqdq xmm12, xmm14
122 punpckhqdq xmm13, xmm14
123 movdqa xmmword ptr [rsp], xmm8
124 movdqa xmmword ptr [rsp+0x10], xmm9
125 movdqa xmmword ptr [rsp+0x20], xmm12
126 movdqa xmmword ptr [rsp+0x30], xmm13
127 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
128 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
129 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
130 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
131 movdqa xmm12, xmm8
132 punpckldq xmm8, xmm9
133 punpckhdq xmm12, xmm9
134 movdqa xmm14, xmm10
135 punpckldq xmm10, xmm11
136 punpckhdq xmm14, xmm11
137 movdqa xmm9, xmm8
138 punpcklqdq xmm8, xmm10
139 punpckhqdq xmm9, xmm10
140 movdqa xmm13, xmm12
141 punpcklqdq xmm12, xmm14
142 punpckhqdq xmm13, xmm14
143 movdqa xmmword ptr [rsp+0x40], xmm8
144 movdqa xmmword ptr [rsp+0x50], xmm9
145 movdqa xmmword ptr [rsp+0x60], xmm12
146 movdqa xmmword ptr [rsp+0x70], xmm13
147 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
148 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
149 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
150 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
151 movdqa xmm12, xmm8
152 punpckldq xmm8, xmm9
153 punpckhdq xmm12, xmm9
154 movdqa xmm14, xmm10
155 punpckldq xmm10, xmm11
156 punpckhdq xmm14, xmm11
157 movdqa xmm9, xmm8
158 punpcklqdq xmm8, xmm10
159 punpckhqdq xmm9, xmm10
160 movdqa xmm13, xmm12
161 punpcklqdq xmm12, xmm14
162 punpckhqdq xmm13, xmm14
163 movdqa xmmword ptr [rsp+0x80], xmm8
164 movdqa xmmword ptr [rsp+0x90], xmm9
165 movdqa xmmword ptr [rsp+0xA0], xmm12
166 movdqa xmmword ptr [rsp+0xB0], xmm13
167 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
168 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
169 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
170 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
171 movdqa xmm12, xmm8
172 punpckldq xmm8, xmm9
173 punpckhdq xmm12, xmm9
174 movdqa xmm14, xmm10
175 punpckldq xmm10, xmm11
176 punpckhdq xmm14, xmm11
177 movdqa xmm9, xmm8
178 punpcklqdq xmm8, xmm10
179 punpckhqdq xmm9, xmm10
180 movdqa xmm13, xmm12
181 punpcklqdq xmm12, xmm14
182 punpckhqdq xmm13, xmm14
183 movdqa xmmword ptr [rsp+0xC0], xmm8
184 movdqa xmmword ptr [rsp+0xD0], xmm9
185 movdqa xmmword ptr [rsp+0xE0], xmm12
186 movdqa xmmword ptr [rsp+0xF0], xmm13
187 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
188 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
189 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
190 movdqa xmm12, xmmword ptr [rsp+0x110]
191 movdqa xmm13, xmmword ptr [rsp+0x120]
192 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
193 movd xmm15, eax
194 pshufd xmm15, xmm15, 0x00
195 prefetcht0 [r8+rdx+0x80]
196 prefetcht0 [r9+rdx+0x80]
197 prefetcht0 [r10+rdx+0x80]
198 prefetcht0 [r11+rdx+0x80]
199 paddd xmm0, xmmword ptr [rsp]
200 paddd xmm1, xmmword ptr [rsp+0x20]
201 paddd xmm2, xmmword ptr [rsp+0x40]
202 paddd xmm3, xmmword ptr [rsp+0x60]
203 paddd xmm0, xmm4
204 paddd xmm1, xmm5
205 paddd xmm2, xmm6
206 paddd xmm3, xmm7
207 pxor xmm12, xmm0
208 pxor xmm13, xmm1
209 pxor xmm14, xmm2
210 pxor xmm15, xmm3
211 movdqa xmm8, xmmword ptr [ROT16+rip]
212 pshufb xmm12, xmm8
213 pshufb xmm13, xmm8
214 pshufb xmm14, xmm8
215 pshufb xmm15, xmm8
216 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
217 paddd xmm8, xmm12
218 paddd xmm9, xmm13
219 paddd xmm10, xmm14
220 paddd xmm11, xmm15
221 pxor xmm4, xmm8
222 pxor xmm5, xmm9
223 pxor xmm6, xmm10
224 pxor xmm7, xmm11
225 movdqa xmmword ptr [rsp+0x100], xmm8
226 movdqa xmm8, xmm4
227 psrld xmm8, 12
228 pslld xmm4, 20
229 por xmm4, xmm8
230 movdqa xmm8, xmm5
231 psrld xmm8, 12
232 pslld xmm5, 20
233 por xmm5, xmm8
234 movdqa xmm8, xmm6
235 psrld xmm8, 12
236 pslld xmm6, 20
237 por xmm6, xmm8
238 movdqa xmm8, xmm7
239 psrld xmm8, 12
240 pslld xmm7, 20
241 por xmm7, xmm8
242 paddd xmm0, xmmword ptr [rsp+0x10]
243 paddd xmm1, xmmword ptr [rsp+0x30]
244 paddd xmm2, xmmword ptr [rsp+0x50]
245 paddd xmm3, xmmword ptr [rsp+0x70]
246 paddd xmm0, xmm4
247 paddd xmm1, xmm5
248 paddd xmm2, xmm6
249 paddd xmm3, xmm7
250 pxor xmm12, xmm0
251 pxor xmm13, xmm1
252 pxor xmm14, xmm2
253 pxor xmm15, xmm3
254 movdqa xmm8, xmmword ptr [ROT8+rip]
255 pshufb xmm12, xmm8
256 pshufb xmm13, xmm8
257 pshufb xmm14, xmm8
258 pshufb xmm15, xmm8
259 movdqa xmm8, xmmword ptr [rsp+0x100]
260 paddd xmm8, xmm12
261 paddd xmm9, xmm13
262 paddd xmm10, xmm14
263 paddd xmm11, xmm15
264 pxor xmm4, xmm8
265 pxor xmm5, xmm9
266 pxor xmm6, xmm10
267 pxor xmm7, xmm11
268 movdqa xmmword ptr [rsp+0x100], xmm8
269 movdqa xmm8, xmm4
270 psrld xmm8, 7
271 pslld xmm4, 25
272 por xmm4, xmm8
273 movdqa xmm8, xmm5
274 psrld xmm8, 7
275 pslld xmm5, 25
276 por xmm5, xmm8
277 movdqa xmm8, xmm6
278 psrld xmm8, 7
279 pslld xmm6, 25
280 por xmm6, xmm8
281 movdqa xmm8, xmm7
282 psrld xmm8, 7
283 pslld xmm7, 25
284 por xmm7, xmm8
285 paddd xmm0, xmmword ptr [rsp+0x80]
286 paddd xmm1, xmmword ptr [rsp+0xA0]
287 paddd xmm2, xmmword ptr [rsp+0xC0]
288 paddd xmm3, xmmword ptr [rsp+0xE0]
289 paddd xmm0, xmm5
290 paddd xmm1, xmm6
291 paddd xmm2, xmm7
292 paddd xmm3, xmm4
293 pxor xmm15, xmm0
294 pxor xmm12, xmm1
295 pxor xmm13, xmm2
296 pxor xmm14, xmm3
297 movdqa xmm8, xmmword ptr [ROT16+rip]
298 pshufb xmm15, xmm8
299 pshufb xmm12, xmm8
300 pshufb xmm13, xmm8
301 pshufb xmm14, xmm8
302 paddd xmm10, xmm15
303 paddd xmm11, xmm12
304 movdqa xmm8, xmmword ptr [rsp+0x100]
305 paddd xmm8, xmm13
306 paddd xmm9, xmm14
307 pxor xmm5, xmm10
308 pxor xmm6, xmm11
309 pxor xmm7, xmm8
310 pxor xmm4, xmm9
311 movdqa xmmword ptr [rsp+0x100], xmm8
312 movdqa xmm8, xmm5
313 psrld xmm8, 12
314 pslld xmm5, 20
315 por xmm5, xmm8
316 movdqa xmm8, xmm6
317 psrld xmm8, 12
318 pslld xmm6, 20
319 por xmm6, xmm8
320 movdqa xmm8, xmm7
321 psrld xmm8, 12
322 pslld xmm7, 20
323 por xmm7, xmm8
324 movdqa xmm8, xmm4
325 psrld xmm8, 12
326 pslld xmm4, 20
327 por xmm4, xmm8
328 paddd xmm0, xmmword ptr [rsp+0x90]
329 paddd xmm1, xmmword ptr [rsp+0xB0]
330 paddd xmm2, xmmword ptr [rsp+0xD0]
331 paddd xmm3, xmmword ptr [rsp+0xF0]
332 paddd xmm0, xmm5
333 paddd xmm1, xmm6
334 paddd xmm2, xmm7
335 paddd xmm3, xmm4
336 pxor xmm15, xmm0
337 pxor xmm12, xmm1
338 pxor xmm13, xmm2
339 pxor xmm14, xmm3
340 movdqa xmm8, xmmword ptr [ROT8+rip]
341 pshufb xmm15, xmm8
342 pshufb xmm12, xmm8
343 pshufb xmm13, xmm8
344 pshufb xmm14, xmm8
345 paddd xmm10, xmm15
346 paddd xmm11, xmm12
347 movdqa xmm8, xmmword ptr [rsp+0x100]
348 paddd xmm8, xmm13
349 paddd xmm9, xmm14
350 pxor xmm5, xmm10
351 pxor xmm6, xmm11
352 pxor xmm7, xmm8
353 pxor xmm4, xmm9
354 movdqa xmmword ptr [rsp+0x100], xmm8
355 movdqa xmm8, xmm5
356 psrld xmm8, 7
357 pslld xmm5, 25
358 por xmm5, xmm8
359 movdqa xmm8, xmm6
360 psrld xmm8, 7
361 pslld xmm6, 25
362 por xmm6, xmm8
363 movdqa xmm8, xmm7
364 psrld xmm8, 7
365 pslld xmm7, 25
366 por xmm7, xmm8
367 movdqa xmm8, xmm4
368 psrld xmm8, 7
369 pslld xmm4, 25
370 por xmm4, xmm8
371 paddd xmm0, xmmword ptr [rsp+0x20]
372 paddd xmm1, xmmword ptr [rsp+0x30]
373 paddd xmm2, xmmword ptr [rsp+0x70]
374 paddd xmm3, xmmword ptr [rsp+0x40]
375 paddd xmm0, xmm4
376 paddd xmm1, xmm5
377 paddd xmm2, xmm6
378 paddd xmm3, xmm7
379 pxor xmm12, xmm0
380 pxor xmm13, xmm1
381 pxor xmm14, xmm2
382 pxor xmm15, xmm3
383 movdqa xmm8, xmmword ptr [ROT16+rip]
384 pshufb xmm12, xmm8
385 pshufb xmm13, xmm8
386 pshufb xmm14, xmm8
387 pshufb xmm15, xmm8
388 movdqa xmm8, xmmword ptr [rsp+0x100]
389 paddd xmm8, xmm12
390 paddd xmm9, xmm13
391 paddd xmm10, xmm14
392 paddd xmm11, xmm15
393 pxor xmm4, xmm8
394 pxor xmm5, xmm9
395 pxor xmm6, xmm10
396 pxor xmm7, xmm11
397 movdqa xmmword ptr [rsp+0x100], xmm8
398 movdqa xmm8, xmm4
399 psrld xmm8, 12
400 pslld xmm4, 20
401 por xmm4, xmm8
402 movdqa xmm8, xmm5
403 psrld xmm8, 12
404 pslld xmm5, 20
405 por xmm5, xmm8
406 movdqa xmm8, xmm6
407 psrld xmm8, 12
408 pslld xmm6, 20
409 por xmm6, xmm8
410 movdqa xmm8, xmm7
411 psrld xmm8, 12
412 pslld xmm7, 20
413 por xmm7, xmm8
414 paddd xmm0, xmmword ptr [rsp+0x60]
415 paddd xmm1, xmmword ptr [rsp+0xA0]
416 paddd xmm2, xmmword ptr [rsp]
417 paddd xmm3, xmmword ptr [rsp+0xD0]
418 paddd xmm0, xmm4
419 paddd xmm1, xmm5
420 paddd xmm2, xmm6
421 paddd xmm3, xmm7
422 pxor xmm12, xmm0
423 pxor xmm13, xmm1
424 pxor xmm14, xmm2
425 pxor xmm15, xmm3
426 movdqa xmm8, xmmword ptr [ROT8+rip]
427 pshufb xmm12, xmm8
428 pshufb xmm13, xmm8
429 pshufb xmm14, xmm8
430 pshufb xmm15, xmm8
431 movdqa xmm8, xmmword ptr [rsp+0x100]
432 paddd xmm8, xmm12
433 paddd xmm9, xmm13
434 paddd xmm10, xmm14
435 paddd xmm11, xmm15
436 pxor xmm4, xmm8
437 pxor xmm5, xmm9
438 pxor xmm6, xmm10
439 pxor xmm7, xmm11
440 movdqa xmmword ptr [rsp+0x100], xmm8
441 movdqa xmm8, xmm4
442 psrld xmm8, 7
443 pslld xmm4, 25
444 por xmm4, xmm8
445 movdqa xmm8, xmm5
446 psrld xmm8, 7
447 pslld xmm5, 25
448 por xmm5, xmm8
449 movdqa xmm8, xmm6
450 psrld xmm8, 7
451 pslld xmm6, 25
452 por xmm6, xmm8
453 movdqa xmm8, xmm7
454 psrld xmm8, 7
455 pslld xmm7, 25
456 por xmm7, xmm8
457 paddd xmm0, xmmword ptr [rsp+0x10]
458 paddd xmm1, xmmword ptr [rsp+0xC0]
459 paddd xmm2, xmmword ptr [rsp+0x90]
460 paddd xmm3, xmmword ptr [rsp+0xF0]
461 paddd xmm0, xmm5
462 paddd xmm1, xmm6
463 paddd xmm2, xmm7
464 paddd xmm3, xmm4
465 pxor xmm15, xmm0
466 pxor xmm12, xmm1
467 pxor xmm13, xmm2
468 pxor xmm14, xmm3
469 movdqa xmm8, xmmword ptr [ROT16+rip]
470 pshufb xmm15, xmm8
471 pshufb xmm12, xmm8
472 pshufb xmm13, xmm8
473 pshufb xmm14, xmm8
474 paddd xmm10, xmm15
475 paddd xmm11, xmm12
476 movdqa xmm8, xmmword ptr [rsp+0x100]
477 paddd xmm8, xmm13
478 paddd xmm9, xmm14
479 pxor xmm5, xmm10
480 pxor xmm6, xmm11
481 pxor xmm7, xmm8
482 pxor xmm4, xmm9
483 movdqa xmmword ptr [rsp+0x100], xmm8
484 movdqa xmm8, xmm5
485 psrld xmm8, 12
486 pslld xmm5, 20
487 por xmm5, xmm8
488 movdqa xmm8, xmm6
489 psrld xmm8, 12
490 pslld xmm6, 20
491 por xmm6, xmm8
492 movdqa xmm8, xmm7
493 psrld xmm8, 12
494 pslld xmm7, 20
495 por xmm7, xmm8
496 movdqa xmm8, xmm4
497 psrld xmm8, 12
498 pslld xmm4, 20
499 por xmm4, xmm8
500 paddd xmm0, xmmword ptr [rsp+0xB0]
501 paddd xmm1, xmmword ptr [rsp+0x50]
502 paddd xmm2, xmmword ptr [rsp+0xE0]
503 paddd xmm3, xmmword ptr [rsp+0x80]
504 paddd xmm0, xmm5
505 paddd xmm1, xmm6
506 paddd xmm2, xmm7
507 paddd xmm3, xmm4
508 pxor xmm15, xmm0
509 pxor xmm12, xmm1
510 pxor xmm13, xmm2
511 pxor xmm14, xmm3
512 movdqa xmm8, xmmword ptr [ROT8+rip]
513 pshufb xmm15, xmm8
514 pshufb xmm12, xmm8
515 pshufb xmm13, xmm8
516 pshufb xmm14, xmm8
517 paddd xmm10, xmm15
518 paddd xmm11, xmm12
519 movdqa xmm8, xmmword ptr [rsp+0x100]
520 paddd xmm8, xmm13
521 paddd xmm9, xmm14
522 pxor xmm5, xmm10
523 pxor xmm6, xmm11
524 pxor xmm7, xmm8
525 pxor xmm4, xmm9
526 movdqa xmmword ptr [rsp+0x100], xmm8
527 movdqa xmm8, xmm5
528 psrld xmm8, 7
529 pslld xmm5, 25
530 por xmm5, xmm8
531 movdqa xmm8, xmm6
532 psrld xmm8, 7
533 pslld xmm6, 25
534 por xmm6, xmm8
535 movdqa xmm8, xmm7
536 psrld xmm8, 7
537 pslld xmm7, 25
538 por xmm7, xmm8
539 movdqa xmm8, xmm4
540 psrld xmm8, 7
541 pslld xmm4, 25
542 por xmm4, xmm8
543 paddd xmm0, xmmword ptr [rsp+0x30]
544 paddd xmm1, xmmword ptr [rsp+0xA0]
545 paddd xmm2, xmmword ptr [rsp+0xD0]
546 paddd xmm3, xmmword ptr [rsp+0x70]
547 paddd xmm0, xmm4
548 paddd xmm1, xmm5
549 paddd xmm2, xmm6
550 paddd xmm3, xmm7
551 pxor xmm12, xmm0
552 pxor xmm13, xmm1
553 pxor xmm14, xmm2
554 pxor xmm15, xmm3
555 movdqa xmm8, xmmword ptr [ROT16+rip]
556 pshufb xmm12, xmm8
557 pshufb xmm13, xmm8
558 pshufb xmm14, xmm8
559 pshufb xmm15, xmm8
560 movdqa xmm8, xmmword ptr [rsp+0x100]
561 paddd xmm8, xmm12
562 paddd xmm9, xmm13
563 paddd xmm10, xmm14
564 paddd xmm11, xmm15
565 pxor xmm4, xmm8
566 pxor xmm5, xmm9
567 pxor xmm6, xmm10
568 pxor xmm7, xmm11
569 movdqa xmmword ptr [rsp+0x100], xmm8
570 movdqa xmm8, xmm4
571 psrld xmm8, 12
572 pslld xmm4, 20
573 por xmm4, xmm8
574 movdqa xmm8, xmm5
575 psrld xmm8, 12
576 pslld xmm5, 20
577 por xmm5, xmm8
578 movdqa xmm8, xmm6
579 psrld xmm8, 12
580 pslld xmm6, 20
581 por xmm6, xmm8
582 movdqa xmm8, xmm7
583 psrld xmm8, 12
584 pslld xmm7, 20
585 por xmm7, xmm8
586 paddd xmm0, xmmword ptr [rsp+0x40]
587 paddd xmm1, xmmword ptr [rsp+0xC0]
588 paddd xmm2, xmmword ptr [rsp+0x20]
589 paddd xmm3, xmmword ptr [rsp+0xE0]
590 paddd xmm0, xmm4
591 paddd xmm1, xmm5
592 paddd xmm2, xmm6
593 paddd xmm3, xmm7
594 pxor xmm12, xmm0
595 pxor xmm13, xmm1
596 pxor xmm14, xmm2
597 pxor xmm15, xmm3
598 movdqa xmm8, xmmword ptr [ROT8+rip]
599 pshufb xmm12, xmm8
600 pshufb xmm13, xmm8
601 pshufb xmm14, xmm8
602 pshufb xmm15, xmm8
603 movdqa xmm8, xmmword ptr [rsp+0x100]
604 paddd xmm8, xmm12
605 paddd xmm9, xmm13
606 paddd xmm10, xmm14
607 paddd xmm11, xmm15
608 pxor xmm4, xmm8
609 pxor xmm5, xmm9
610 pxor xmm6, xmm10
611 pxor xmm7, xmm11
612 movdqa xmmword ptr [rsp+0x100], xmm8
613 movdqa xmm8, xmm4
614 psrld xmm8, 7
615 pslld xmm4, 25
616 por xmm4, xmm8
617 movdqa xmm8, xmm5
618 psrld xmm8, 7
619 pslld xmm5, 25
620 por xmm5, xmm8
621 movdqa xmm8, xmm6
622 psrld xmm8, 7
623 pslld xmm6, 25
624 por xmm6, xmm8
625 movdqa xmm8, xmm7
626 psrld xmm8, 7
627 pslld xmm7, 25
628 por xmm7, xmm8
629 paddd xmm0, xmmword ptr [rsp+0x60]
630 paddd xmm1, xmmword ptr [rsp+0x90]
631 paddd xmm2, xmmword ptr [rsp+0xB0]
632 paddd xmm3, xmmword ptr [rsp+0x80]
633 paddd xmm0, xmm5
634 paddd xmm1, xmm6
635 paddd xmm2, xmm7
636 paddd xmm3, xmm4
637 pxor xmm15, xmm0
638 pxor xmm12, xmm1
639 pxor xmm13, xmm2
640 pxor xmm14, xmm3
641 movdqa xmm8, xmmword ptr [ROT16+rip]
642 pshufb xmm15, xmm8
643 pshufb xmm12, xmm8
644 pshufb xmm13, xmm8
645 pshufb xmm14, xmm8
646 paddd xmm10, xmm15
647 paddd xmm11, xmm12
648 movdqa xmm8, xmmword ptr [rsp+0x100]
649 paddd xmm8, xmm13
650 paddd xmm9, xmm14
651 pxor xmm5, xmm10
652 pxor xmm6, xmm11
653 pxor xmm7, xmm8
654 pxor xmm4, xmm9
655 movdqa xmmword ptr [rsp+0x100], xmm8
656 movdqa xmm8, xmm5
657 psrld xmm8, 12
658 pslld xmm5, 20
659 por xmm5, xmm8
660 movdqa xmm8, xmm6
661 psrld xmm8, 12
662 pslld xmm6, 20
663 por xmm6, xmm8
664 movdqa xmm8, xmm7
665 psrld xmm8, 12
666 pslld xmm7, 20
667 por xmm7, xmm8
668 movdqa xmm8, xmm4
669 psrld xmm8, 12
670 pslld xmm4, 20
671 por xmm4, xmm8
672 paddd xmm0, xmmword ptr [rsp+0x50]
673 paddd xmm1, xmmword ptr [rsp]
674 paddd xmm2, xmmword ptr [rsp+0xF0]
675 paddd xmm3, xmmword ptr [rsp+0x10]
676 paddd xmm0, xmm5
677 paddd xmm1, xmm6
678 paddd xmm2, xmm7
679 paddd xmm3, xmm4
680 pxor xmm15, xmm0
681 pxor xmm12, xmm1
682 pxor xmm13, xmm2
683 pxor xmm14, xmm3
684 movdqa xmm8, xmmword ptr [ROT8+rip]
685 pshufb xmm15, xmm8
686 pshufb xmm12, xmm8
687 pshufb xmm13, xmm8
688 pshufb xmm14, xmm8
689 paddd xmm10, xmm15
690 paddd xmm11, xmm12
691 movdqa xmm8, xmmword ptr [rsp+0x100]
692 paddd xmm8, xmm13
693 paddd xmm9, xmm14
694 pxor xmm5, xmm10
695 pxor xmm6, xmm11
696 pxor xmm7, xmm8
697 pxor xmm4, xmm9
698 movdqa xmmword ptr [rsp+0x100], xmm8
699 movdqa xmm8, xmm5
700 psrld xmm8, 7
701 pslld xmm5, 25
702 por xmm5, xmm8
703 movdqa xmm8, xmm6
704 psrld xmm8, 7
705 pslld xmm6, 25
706 por xmm6, xmm8
707 movdqa xmm8, xmm7
708 psrld xmm8, 7
709 pslld xmm7, 25
710 por xmm7, xmm8
711 movdqa xmm8, xmm4
712 psrld xmm8, 7
713 pslld xmm4, 25
714 por xmm4, xmm8
715 paddd xmm0, xmmword ptr [rsp+0xA0]
716 paddd xmm1, xmmword ptr [rsp+0xC0]
717 paddd xmm2, xmmword ptr [rsp+0xE0]
718 paddd xmm3, xmmword ptr [rsp+0xD0]
719 paddd xmm0, xmm4
720 paddd xmm1, xmm5
721 paddd xmm2, xmm6
722 paddd xmm3, xmm7
723 pxor xmm12, xmm0
724 pxor xmm13, xmm1
725 pxor xmm14, xmm2
726 pxor xmm15, xmm3
727 movdqa xmm8, xmmword ptr [ROT16+rip]
728 pshufb xmm12, xmm8
729 pshufb xmm13, xmm8
730 pshufb xmm14, xmm8
731 pshufb xmm15, xmm8
732 movdqa xmm8, xmmword ptr [rsp+0x100]
733 paddd xmm8, xmm12
734 paddd xmm9, xmm13
735 paddd xmm10, xmm14
736 paddd xmm11, xmm15
737 pxor xmm4, xmm8
738 pxor xmm5, xmm9
739 pxor xmm6, xmm10
740 pxor xmm7, xmm11
741 movdqa xmmword ptr [rsp+0x100], xmm8
742 movdqa xmm8, xmm4
743 psrld xmm8, 12
744 pslld xmm4, 20
745 por xmm4, xmm8
746 movdqa xmm8, xmm5
747 psrld xmm8, 12
748 pslld xmm5, 20
749 por xmm5, xmm8
750 movdqa xmm8, xmm6
751 psrld xmm8, 12
752 pslld xmm6, 20
753 por xmm6, xmm8
754 movdqa xmm8, xmm7
755 psrld xmm8, 12
756 pslld xmm7, 20
757 por xmm7, xmm8
758 paddd xmm0, xmmword ptr [rsp+0x70]
759 paddd xmm1, xmmword ptr [rsp+0x90]
760 paddd xmm2, xmmword ptr [rsp+0x30]
761 paddd xmm3, xmmword ptr [rsp+0xF0]
762 paddd xmm0, xmm4
763 paddd xmm1, xmm5
764 paddd xmm2, xmm6
765 paddd xmm3, xmm7
766 pxor xmm12, xmm0
767 pxor xmm13, xmm1
768 pxor xmm14, xmm2
769 pxor xmm15, xmm3
770 movdqa xmm8, xmmword ptr [ROT8+rip]
771 pshufb xmm12, xmm8
772 pshufb xmm13, xmm8
773 pshufb xmm14, xmm8
774 pshufb xmm15, xmm8
775 movdqa xmm8, xmmword ptr [rsp+0x100]
776 paddd xmm8, xmm12
777 paddd xmm9, xmm13
778 paddd xmm10, xmm14
779 paddd xmm11, xmm15
780 pxor xmm4, xmm8
781 pxor xmm5, xmm9
782 pxor xmm6, xmm10
783 pxor xmm7, xmm11
784 movdqa xmmword ptr [rsp+0x100], xmm8
785 movdqa xmm8, xmm4
786 psrld xmm8, 7
787 pslld xmm4, 25
788 por xmm4, xmm8
789 movdqa xmm8, xmm5
790 psrld xmm8, 7
791 pslld xmm5, 25
792 por xmm5, xmm8
793 movdqa xmm8, xmm6
794 psrld xmm8, 7
795 pslld xmm6, 25
796 por xmm6, xmm8
797 movdqa xmm8, xmm7
798 psrld xmm8, 7
799 pslld xmm7, 25
800 por xmm7, xmm8
801 paddd xmm0, xmmword ptr [rsp+0x40]
802 paddd xmm1, xmmword ptr [rsp+0xB0]
803 paddd xmm2, xmmword ptr [rsp+0x50]
804 paddd xmm3, xmmword ptr [rsp+0x10]
805 paddd xmm0, xmm5
806 paddd xmm1, xmm6
807 paddd xmm2, xmm7
808 paddd xmm3, xmm4
809 pxor xmm15, xmm0
810 pxor xmm12, xmm1
811 pxor xmm13, xmm2
812 pxor xmm14, xmm3
813 movdqa xmm8, xmmword ptr [ROT16+rip]
814 pshufb xmm15, xmm8
815 pshufb xmm12, xmm8
816 pshufb xmm13, xmm8
817 pshufb xmm14, xmm8
818 paddd xmm10, xmm15
819 paddd xmm11, xmm12
820 movdqa xmm8, xmmword ptr [rsp+0x100]
821 paddd xmm8, xmm13
822 paddd xmm9, xmm14
823 pxor xmm5, xmm10
824 pxor xmm6, xmm11
825 pxor xmm7, xmm8
826 pxor xmm4, xmm9
827 movdqa xmmword ptr [rsp+0x100], xmm8
828 movdqa xmm8, xmm5
829 psrld xmm8, 12
830 pslld xmm5, 20
831 por xmm5, xmm8
832 movdqa xmm8, xmm6
833 psrld xmm8, 12
834 pslld xmm6, 20
835 por xmm6, xmm8
836 movdqa xmm8, xmm7
837 psrld xmm8, 12
838 pslld xmm7, 20
839 por xmm7, xmm8
840 movdqa xmm8, xmm4
841 psrld xmm8, 12
842 pslld xmm4, 20
843 por xmm4, xmm8
844 paddd xmm0, xmmword ptr [rsp]
845 paddd xmm1, xmmword ptr [rsp+0x20]
846 paddd xmm2, xmmword ptr [rsp+0x80]
847 paddd xmm3, xmmword ptr [rsp+0x60]
848 paddd xmm0, xmm5
849 paddd xmm1, xmm6
850 paddd xmm2, xmm7
851 paddd xmm3, xmm4
852 pxor xmm15, xmm0
853 pxor xmm12, xmm1
854 pxor xmm13, xmm2
855 pxor xmm14, xmm3
856 movdqa xmm8, xmmword ptr [ROT8+rip]
857 pshufb xmm15, xmm8
858 pshufb xmm12, xmm8
859 pshufb xmm13, xmm8
860 pshufb xmm14, xmm8
861 paddd xmm10, xmm15
862 paddd xmm11, xmm12
863 movdqa xmm8, xmmword ptr [rsp+0x100]
864 paddd xmm8, xmm13
865 paddd xmm9, xmm14
866 pxor xmm5, xmm10
867 pxor xmm6, xmm11
868 pxor xmm7, xmm8
869 pxor xmm4, xmm9
870 movdqa xmmword ptr [rsp+0x100], xmm8
871 movdqa xmm8, xmm5
872 psrld xmm8, 7
873 pslld xmm5, 25
874 por xmm5, xmm8
875 movdqa xmm8, xmm6
876 psrld xmm8, 7
877 pslld xmm6, 25
878 por xmm6, xmm8
879 movdqa xmm8, xmm7
880 psrld xmm8, 7
881 pslld xmm7, 25
882 por xmm7, xmm8
883 movdqa xmm8, xmm4
884 psrld xmm8, 7
885 pslld xmm4, 25
886 por xmm4, xmm8
887 paddd xmm0, xmmword ptr [rsp+0xC0]
888 paddd xmm1, xmmword ptr [rsp+0x90]
889 paddd xmm2, xmmword ptr [rsp+0xF0]
890 paddd xmm3, xmmword ptr [rsp+0xE0]
891 paddd xmm0, xmm4
892 paddd xmm1, xmm5
893 paddd xmm2, xmm6
894 paddd xmm3, xmm7
895 pxor xmm12, xmm0
896 pxor xmm13, xmm1
897 pxor xmm14, xmm2
898 pxor xmm15, xmm3
899 movdqa xmm8, xmmword ptr [ROT16+rip]
900 pshufb xmm12, xmm8
901 pshufb xmm13, xmm8
902 pshufb xmm14, xmm8
903 pshufb xmm15, xmm8
904 movdqa xmm8, xmmword ptr [rsp+0x100]
905 paddd xmm8, xmm12
906 paddd xmm9, xmm13
907 paddd xmm10, xmm14
908 paddd xmm11, xmm15
909 pxor xmm4, xmm8
910 pxor xmm5, xmm9
911 pxor xmm6, xmm10
912 pxor xmm7, xmm11
913 movdqa xmmword ptr [rsp+0x100], xmm8
914 movdqa xmm8, xmm4
915 psrld xmm8, 12
916 pslld xmm4, 20
917 por xmm4, xmm8
918 movdqa xmm8, xmm5
919 psrld xmm8, 12
920 pslld xmm5, 20
921 por xmm5, xmm8
922 movdqa xmm8, xmm6
923 psrld xmm8, 12
924 pslld xmm6, 20
925 por xmm6, xmm8
926 movdqa xmm8, xmm7
927 psrld xmm8, 12
928 pslld xmm7, 20
929 por xmm7, xmm8
930 paddd xmm0, xmmword ptr [rsp+0xD0]
931 paddd xmm1, xmmword ptr [rsp+0xB0]
932 paddd xmm2, xmmword ptr [rsp+0xA0]
933 paddd xmm3, xmmword ptr [rsp+0x80]
934 paddd xmm0, xmm4
935 paddd xmm1, xmm5
936 paddd xmm2, xmm6
937 paddd xmm3, xmm7
938 pxor xmm12, xmm0
939 pxor xmm13, xmm1
940 pxor xmm14, xmm2
941 pxor xmm15, xmm3
942 movdqa xmm8, xmmword ptr [ROT8+rip]
943 pshufb xmm12, xmm8
944 pshufb xmm13, xmm8
945 pshufb xmm14, xmm8
946 pshufb xmm15, xmm8
947 movdqa xmm8, xmmword ptr [rsp+0x100]
948 paddd xmm8, xmm12
949 paddd xmm9, xmm13
950 paddd xmm10, xmm14
951 paddd xmm11, xmm15
952 pxor xmm4, xmm8
953 pxor xmm5, xmm9
954 pxor xmm6, xmm10
955 pxor xmm7, xmm11
956 movdqa xmmword ptr [rsp+0x100], xmm8
957 movdqa xmm8, xmm4
958 psrld xmm8, 7
959 pslld xmm4, 25
960 por xmm4, xmm8
961 movdqa xmm8, xmm5
962 psrld xmm8, 7
963 pslld xmm5, 25
964 por xmm5, xmm8
965 movdqa xmm8, xmm6
966 psrld xmm8, 7
967 pslld xmm6, 25
968 por xmm6, xmm8
969 movdqa xmm8, xmm7
970 psrld xmm8, 7
971 pslld xmm7, 25
972 por xmm7, xmm8
973 paddd xmm0, xmmword ptr [rsp+0x70]
974 paddd xmm1, xmmword ptr [rsp+0x50]
975 paddd xmm2, xmmword ptr [rsp]
976 paddd xmm3, xmmword ptr [rsp+0x60]
977 paddd xmm0, xmm5
978 paddd xmm1, xmm6
979 paddd xmm2, xmm7
980 paddd xmm3, xmm4
981 pxor xmm15, xmm0
982 pxor xmm12, xmm1
983 pxor xmm13, xmm2
984 pxor xmm14, xmm3
985 movdqa xmm8, xmmword ptr [ROT16+rip]
986 pshufb xmm15, xmm8
987 pshufb xmm12, xmm8
988 pshufb xmm13, xmm8
989 pshufb xmm14, xmm8
990 paddd xmm10, xmm15
991 paddd xmm11, xmm12
992 movdqa xmm8, xmmword ptr [rsp+0x100]
993 paddd xmm8, xmm13
994 paddd xmm9, xmm14
995 pxor xmm5, xmm10
996 pxor xmm6, xmm11
997 pxor xmm7, xmm8
998 pxor xmm4, xmm9
999 movdqa xmmword ptr [rsp+0x100], xmm8
1000 movdqa xmm8, xmm5
1001 psrld xmm8, 12
1002 pslld xmm5, 20
1003 por xmm5, xmm8
1004 movdqa xmm8, xmm6
1005 psrld xmm8, 12
1006 pslld xmm6, 20
1007 por xmm6, xmm8
1008 movdqa xmm8, xmm7
1009 psrld xmm8, 12
1010 pslld xmm7, 20
1011 por xmm7, xmm8
1012 movdqa xmm8, xmm4
1013 psrld xmm8, 12
1014 pslld xmm4, 20
1015 por xmm4, xmm8
1016 paddd xmm0, xmmword ptr [rsp+0x20]
1017 paddd xmm1, xmmword ptr [rsp+0x30]
1018 paddd xmm2, xmmword ptr [rsp+0x10]
1019 paddd xmm3, xmmword ptr [rsp+0x40]
1020 paddd xmm0, xmm5
1021 paddd xmm1, xmm6
1022 paddd xmm2, xmm7
1023 paddd xmm3, xmm4
1024 pxor xmm15, xmm0
1025 pxor xmm12, xmm1
1026 pxor xmm13, xmm2
1027 pxor xmm14, xmm3
1028 movdqa xmm8, xmmword ptr [ROT8+rip]
1029 pshufb xmm15, xmm8
1030 pshufb xmm12, xmm8
1031 pshufb xmm13, xmm8
1032 pshufb xmm14, xmm8
1033 paddd xmm10, xmm15
1034 paddd xmm11, xmm12
1035 movdqa xmm8, xmmword ptr [rsp+0x100]
1036 paddd xmm8, xmm13
1037 paddd xmm9, xmm14
1038 pxor xmm5, xmm10
1039 pxor xmm6, xmm11
1040 pxor xmm7, xmm8
1041 pxor xmm4, xmm9
1042 movdqa xmmword ptr [rsp+0x100], xmm8
1043 movdqa xmm8, xmm5
1044 psrld xmm8, 7
1045 pslld xmm5, 25
1046 por xmm5, xmm8
1047 movdqa xmm8, xmm6
1048 psrld xmm8, 7
1049 pslld xmm6, 25
1050 por xmm6, xmm8
1051 movdqa xmm8, xmm7
1052 psrld xmm8, 7
1053 pslld xmm7, 25
1054 por xmm7, xmm8
1055 movdqa xmm8, xmm4
1056 psrld xmm8, 7
1057 pslld xmm4, 25
1058 por xmm4, xmm8
1059 paddd xmm0, xmmword ptr [rsp+0x90]
1060 paddd xmm1, xmmword ptr [rsp+0xB0]
1061 paddd xmm2, xmmword ptr [rsp+0x80]
1062 paddd xmm3, xmmword ptr [rsp+0xF0]
1063 paddd xmm0, xmm4
1064 paddd xmm1, xmm5
1065 paddd xmm2, xmm6
1066 paddd xmm3, xmm7
1067 pxor xmm12, xmm0
1068 pxor xmm13, xmm1
1069 pxor xmm14, xmm2
1070 pxor xmm15, xmm3
1071 movdqa xmm8, xmmword ptr [ROT16+rip]
1072 pshufb xmm12, xmm8
1073 pshufb xmm13, xmm8
1074 pshufb xmm14, xmm8
1075 pshufb xmm15, xmm8
1076 movdqa xmm8, xmmword ptr [rsp+0x100]
1077 paddd xmm8, xmm12
1078 paddd xmm9, xmm13
1079 paddd xmm10, xmm14
1080 paddd xmm11, xmm15
1081 pxor xmm4, xmm8
1082 pxor xmm5, xmm9
1083 pxor xmm6, xmm10
1084 pxor xmm7, xmm11
1085 movdqa xmmword ptr [rsp+0x100], xmm8
1086 movdqa xmm8, xmm4
1087 psrld xmm8, 12
1088 pslld xmm4, 20
1089 por xmm4, xmm8
1090 movdqa xmm8, xmm5
1091 psrld xmm8, 12
1092 pslld xmm5, 20
1093 por xmm5, xmm8
1094 movdqa xmm8, xmm6
1095 psrld xmm8, 12
1096 pslld xmm6, 20
1097 por xmm6, xmm8
1098 movdqa xmm8, xmm7
1099 psrld xmm8, 12
1100 pslld xmm7, 20
1101 por xmm7, xmm8
1102 paddd xmm0, xmmword ptr [rsp+0xE0]
1103 paddd xmm1, xmmword ptr [rsp+0x50]
1104 paddd xmm2, xmmword ptr [rsp+0xC0]
1105 paddd xmm3, xmmword ptr [rsp+0x10]
1106 paddd xmm0, xmm4
1107 paddd xmm1, xmm5
1108 paddd xmm2, xmm6
1109 paddd xmm3, xmm7
1110 pxor xmm12, xmm0
1111 pxor xmm13, xmm1
1112 pxor xmm14, xmm2
1113 pxor xmm15, xmm3
1114 movdqa xmm8, xmmword ptr [ROT8+rip]
1115 pshufb xmm12, xmm8
1116 pshufb xmm13, xmm8
1117 pshufb xmm14, xmm8
1118 pshufb xmm15, xmm8
1119 movdqa xmm8, xmmword ptr [rsp+0x100]
1120 paddd xmm8, xmm12
1121 paddd xmm9, xmm13
1122 paddd xmm10, xmm14
1123 paddd xmm11, xmm15
1124 pxor xmm4, xmm8
1125 pxor xmm5, xmm9
1126 pxor xmm6, xmm10
1127 pxor xmm7, xmm11
1128 movdqa xmmword ptr [rsp+0x100], xmm8
1129 movdqa xmm8, xmm4
1130 psrld xmm8, 7
1131 pslld xmm4, 25
1132 por xmm4, xmm8
1133 movdqa xmm8, xmm5
1134 psrld xmm8, 7
1135 pslld xmm5, 25
1136 por xmm5, xmm8
1137 movdqa xmm8, xmm6
1138 psrld xmm8, 7
1139 pslld xmm6, 25
1140 por xmm6, xmm8
1141 movdqa xmm8, xmm7
1142 psrld xmm8, 7
1143 pslld xmm7, 25
1144 por xmm7, xmm8
1145 paddd xmm0, xmmword ptr [rsp+0xD0]
1146 paddd xmm1, xmmword ptr [rsp]
1147 paddd xmm2, xmmword ptr [rsp+0x20]
1148 paddd xmm3, xmmword ptr [rsp+0x40]
1149 paddd xmm0, xmm5
1150 paddd xmm1, xmm6
1151 paddd xmm2, xmm7
1152 paddd xmm3, xmm4
1153 pxor xmm15, xmm0
1154 pxor xmm12, xmm1
1155 pxor xmm13, xmm2
1156 pxor xmm14, xmm3
1157 movdqa xmm8, xmmword ptr [ROT16+rip]
1158 pshufb xmm15, xmm8
1159 pshufb xmm12, xmm8
1160 pshufb xmm13, xmm8
1161 pshufb xmm14, xmm8
1162 paddd xmm10, xmm15
1163 paddd xmm11, xmm12
1164 movdqa xmm8, xmmword ptr [rsp+0x100]
1165 paddd xmm8, xmm13
1166 paddd xmm9, xmm14
1167 pxor xmm5, xmm10
1168 pxor xmm6, xmm11
1169 pxor xmm7, xmm8
1170 pxor xmm4, xmm9
1171 movdqa xmmword ptr [rsp+0x100], xmm8
1172 movdqa xmm8, xmm5
1173 psrld xmm8, 12
1174 pslld xmm5, 20
1175 por xmm5, xmm8
1176 movdqa xmm8, xmm6
1177 psrld xmm8, 12
1178 pslld xmm6, 20
1179 por xmm6, xmm8
1180 movdqa xmm8, xmm7
1181 psrld xmm8, 12
1182 pslld xmm7, 20
1183 por xmm7, xmm8
1184 movdqa xmm8, xmm4
1185 psrld xmm8, 12
1186 pslld xmm4, 20
1187 por xmm4, xmm8
1188 paddd xmm0, xmmword ptr [rsp+0x30]
1189 paddd xmm1, xmmword ptr [rsp+0xA0]
1190 paddd xmm2, xmmword ptr [rsp+0x60]
1191 paddd xmm3, xmmword ptr [rsp+0x70]
1192 paddd xmm0, xmm5
1193 paddd xmm1, xmm6
1194 paddd xmm2, xmm7
1195 paddd xmm3, xmm4
1196 pxor xmm15, xmm0
1197 pxor xmm12, xmm1
1198 pxor xmm13, xmm2
1199 pxor xmm14, xmm3
1200 movdqa xmm8, xmmword ptr [ROT8+rip]
1201 pshufb xmm15, xmm8
1202 pshufb xmm12, xmm8
1203 pshufb xmm13, xmm8
1204 pshufb xmm14, xmm8
1205 paddd xmm10, xmm15
1206 paddd xmm11, xmm12
1207 movdqa xmm8, xmmword ptr [rsp+0x100]
1208 paddd xmm8, xmm13
1209 paddd xmm9, xmm14
1210 pxor xmm5, xmm10
1211 pxor xmm6, xmm11
1212 pxor xmm7, xmm8
1213 pxor xmm4, xmm9
1214 movdqa xmmword ptr [rsp+0x100], xmm8
1215 movdqa xmm8, xmm5
1216 psrld xmm8, 7
1217 pslld xmm5, 25
1218 por xmm5, xmm8
1219 movdqa xmm8, xmm6
1220 psrld xmm8, 7
1221 pslld xmm6, 25
1222 por xmm6, xmm8
1223 movdqa xmm8, xmm7
1224 psrld xmm8, 7
1225 pslld xmm7, 25
1226 por xmm7, xmm8
1227 movdqa xmm8, xmm4
1228 psrld xmm8, 7
1229 pslld xmm4, 25
1230 por xmm4, xmm8
1231 paddd xmm0, xmmword ptr [rsp+0xB0]
1232 paddd xmm1, xmmword ptr [rsp+0x50]
1233 paddd xmm2, xmmword ptr [rsp+0x10]
1234 paddd xmm3, xmmword ptr [rsp+0x80]
1235 paddd xmm0, xmm4
1236 paddd xmm1, xmm5
1237 paddd xmm2, xmm6
1238 paddd xmm3, xmm7
1239 pxor xmm12, xmm0
1240 pxor xmm13, xmm1
1241 pxor xmm14, xmm2
1242 pxor xmm15, xmm3
1243 movdqa xmm8, xmmword ptr [ROT16+rip]
1244 pshufb xmm12, xmm8
1245 pshufb xmm13, xmm8
1246 pshufb xmm14, xmm8
1247 pshufb xmm15, xmm8
1248 movdqa xmm8, xmmword ptr [rsp+0x100]
1249 paddd xmm8, xmm12
1250 paddd xmm9, xmm13
1251 paddd xmm10, xmm14
1252 paddd xmm11, xmm15
1253 pxor xmm4, xmm8
1254 pxor xmm5, xmm9
1255 pxor xmm6, xmm10
1256 pxor xmm7, xmm11
1257 movdqa xmmword ptr [rsp+0x100], xmm8
1258 movdqa xmm8, xmm4
1259 psrld xmm8, 12
1260 pslld xmm4, 20
1261 por xmm4, xmm8
1262 movdqa xmm8, xmm5
1263 psrld xmm8, 12
1264 pslld xmm5, 20
1265 por xmm5, xmm8
1266 movdqa xmm8, xmm6
1267 psrld xmm8, 12
1268 pslld xmm6, 20
1269 por xmm6, xmm8
1270 movdqa xmm8, xmm7
1271 psrld xmm8, 12
1272 pslld xmm7, 20
1273 por xmm7, xmm8
1274 paddd xmm0, xmmword ptr [rsp+0xF0]
1275 paddd xmm1, xmmword ptr [rsp]
1276 paddd xmm2, xmmword ptr [rsp+0x90]
1277 paddd xmm3, xmmword ptr [rsp+0x60]
1278 paddd xmm0, xmm4
1279 paddd xmm1, xmm5
1280 paddd xmm2, xmm6
1281 paddd xmm3, xmm7
1282 pxor xmm12, xmm0
1283 pxor xmm13, xmm1
1284 pxor xmm14, xmm2
1285 pxor xmm15, xmm3
1286 movdqa xmm8, xmmword ptr [ROT8+rip]
1287 pshufb xmm12, xmm8
1288 pshufb xmm13, xmm8
1289 pshufb xmm14, xmm8
1290 pshufb xmm15, xmm8
1291 movdqa xmm8, xmmword ptr [rsp+0x100]
1292 paddd xmm8, xmm12
1293 paddd xmm9, xmm13
1294 paddd xmm10, xmm14
1295 paddd xmm11, xmm15
1296 pxor xmm4, xmm8
1297 pxor xmm5, xmm9
1298 pxor xmm6, xmm10
1299 pxor xmm7, xmm11
1300 movdqa xmmword ptr [rsp+0x100], xmm8
1301 movdqa xmm8, xmm4
1302 psrld xmm8, 7
1303 pslld xmm4, 25
1304 por xmm4, xmm8
1305 movdqa xmm8, xmm5
1306 psrld xmm8, 7
1307 pslld xmm5, 25
1308 por xmm5, xmm8
1309 movdqa xmm8, xmm6
1310 psrld xmm8, 7
1311 pslld xmm6, 25
1312 por xmm6, xmm8
1313 movdqa xmm8, xmm7
1314 psrld xmm8, 7
1315 pslld xmm7, 25
1316 por xmm7, xmm8
1317 paddd xmm0, xmmword ptr [rsp+0xE0]
1318 paddd xmm1, xmmword ptr [rsp+0x20]
1319 paddd xmm2, xmmword ptr [rsp+0x30]
1320 paddd xmm3, xmmword ptr [rsp+0x70]
1321 paddd xmm0, xmm5
1322 paddd xmm1, xmm6
1323 paddd xmm2, xmm7
1324 paddd xmm3, xmm4
1325 pxor xmm15, xmm0
1326 pxor xmm12, xmm1
1327 pxor xmm13, xmm2
1328 pxor xmm14, xmm3
1329 movdqa xmm8, xmmword ptr [ROT16+rip]
1330 pshufb xmm15, xmm8
1331 pshufb xmm12, xmm8
1332 pshufb xmm13, xmm8
1333 pshufb xmm14, xmm8
1334 paddd xmm10, xmm15
1335 paddd xmm11, xmm12
1336 movdqa xmm8, xmmword ptr [rsp+0x100]
1337 paddd xmm8, xmm13
1338 paddd xmm9, xmm14
1339 pxor xmm5, xmm10
1340 pxor xmm6, xmm11
1341 pxor xmm7, xmm8
1342 pxor xmm4, xmm9
1343 movdqa xmmword ptr [rsp+0x100], xmm8
1344 movdqa xmm8, xmm5
1345 psrld xmm8, 12
1346 pslld xmm5, 20
1347 por xmm5, xmm8
1348 movdqa xmm8, xmm6
1349 psrld xmm8, 12
1350 pslld xmm6, 20
1351 por xmm6, xmm8
1352 movdqa xmm8, xmm7
1353 psrld xmm8, 12
1354 pslld xmm7, 20
1355 por xmm7, xmm8
1356 movdqa xmm8, xmm4
1357 psrld xmm8, 12
1358 pslld xmm4, 20
1359 por xmm4, xmm8
1360 paddd xmm0, xmmword ptr [rsp+0xA0]
1361 paddd xmm1, xmmword ptr [rsp+0xC0]
1362 paddd xmm2, xmmword ptr [rsp+0x40]
1363 paddd xmm3, xmmword ptr [rsp+0xD0]
1364 paddd xmm0, xmm5
1365 paddd xmm1, xmm6
1366 paddd xmm2, xmm7
1367 paddd xmm3, xmm4
1368 pxor xmm15, xmm0
1369 pxor xmm12, xmm1
1370 pxor xmm13, xmm2
1371 pxor xmm14, xmm3
1372 movdqa xmm8, xmmword ptr [ROT8+rip]
1373 pshufb xmm15, xmm8
1374 pshufb xmm12, xmm8
1375 pshufb xmm13, xmm8
1376 pshufb xmm14, xmm8
1377 paddd xmm10, xmm15
1378 paddd xmm11, xmm12
1379 movdqa xmm8, xmmword ptr [rsp+0x100]
1380 paddd xmm8, xmm13
1381 paddd xmm9, xmm14
1382 pxor xmm5, xmm10
1383 pxor xmm6, xmm11
1384 pxor xmm7, xmm8
1385 pxor xmm4, xmm9
1386 pxor xmm0, xmm8
1387 pxor xmm1, xmm9
1388 pxor xmm2, xmm10
1389 pxor xmm3, xmm11
1390 movdqa xmm8, xmm5
1391 psrld xmm8, 7
1392 pslld xmm5, 25
1393 por xmm5, xmm8
1394 movdqa xmm8, xmm6
1395 psrld xmm8, 7
1396 pslld xmm6, 25
1397 por xmm6, xmm8
1398 movdqa xmm8, xmm7
1399 psrld xmm8, 7
1400 pslld xmm7, 25
1401 por xmm7, xmm8
1402 movdqa xmm8, xmm4
1403 psrld xmm8, 7
1404 pslld xmm4, 25
1405 por xmm4, xmm8
1406 pxor xmm4, xmm12
1407 pxor xmm5, xmm13
1408 pxor xmm6, xmm14
1409 pxor xmm7, xmm15
1410 mov eax, r13d
1411 jne 9b
1412 movdqa xmm9, xmm0
1413 punpckldq xmm0, xmm1
1414 punpckhdq xmm9, xmm1
1415 movdqa xmm11, xmm2
1416 punpckldq xmm2, xmm3
1417 punpckhdq xmm11, xmm3
1418 movdqa xmm1, xmm0
1419 punpcklqdq xmm0, xmm2
1420 punpckhqdq xmm1, xmm2
1421 movdqa xmm3, xmm9
1422 punpcklqdq xmm9, xmm11
1423 punpckhqdq xmm3, xmm11
1424 movdqu xmmword ptr [rbx], xmm0
1425 movdqu xmmword ptr [rbx+0x20], xmm1
1426 movdqu xmmword ptr [rbx+0x40], xmm9
1427 movdqu xmmword ptr [rbx+0x60], xmm3
1428 movdqa xmm9, xmm4
1429 punpckldq xmm4, xmm5
1430 punpckhdq xmm9, xmm5
1431 movdqa xmm11, xmm6
1432 punpckldq xmm6, xmm7
1433 punpckhdq xmm11, xmm7
1434 movdqa xmm5, xmm4
1435 punpcklqdq xmm4, xmm6
1436 punpckhqdq xmm5, xmm6
1437 movdqa xmm7, xmm9
1438 punpcklqdq xmm9, xmm11
1439 punpckhqdq xmm7, xmm11
1440 movdqu xmmword ptr [rbx+0x10], xmm4
1441 movdqu xmmword ptr [rbx+0x30], xmm5
1442 movdqu xmmword ptr [rbx+0x50], xmm9
1443 movdqu xmmword ptr [rbx+0x70], xmm7
1444 movdqa xmm1, xmmword ptr [rsp+0x110]
1445 movdqa xmm0, xmm1
1446 paddd xmm1, xmmword ptr [rsp+0x150]
1447 movdqa xmmword ptr [rsp+0x110], xmm1
1448 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1449 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1450 pcmpgtd xmm0, xmm1
1451 movdqa xmm1, xmmword ptr [rsp+0x120]
1452 psubd xmm1, xmm0
1453 movdqa xmmword ptr [rsp+0x120], xmm1
1454 add rbx, 128
1455 add rdi, 32
1456 sub rsi, 4
1457 cmp rsi, 4
1458 jnc 2b
1459 test rsi, rsi
1460 jnz 3f
14614:
1462 mov rsp, rbp
1463 pop rbp
1464 pop rbx
1465 pop r12
1466 pop r13
1467 pop r14
1468 pop r15
1469 ret
1470.p2align 5
14713:
1472 test esi, 0x2
1473 je 3f
1474 movups xmm0, xmmword ptr [rcx]
1475 movups xmm1, xmmword ptr [rcx+0x10]
1476 movaps xmm8, xmm0
1477 movaps xmm9, xmm1
1478 movd xmm13, dword ptr [rsp+0x110]
1479 pinsrd xmm13, dword ptr [rsp+0x120], 1
1480 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1481 movaps xmmword ptr [rsp], xmm13
1482 movd xmm14, dword ptr [rsp+0x114]
1483 pinsrd xmm14, dword ptr [rsp+0x124], 1
1484 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1485 movaps xmmword ptr [rsp+0x10], xmm14
1486 mov r8, qword ptr [rdi]
1487 mov r9, qword ptr [rdi+0x8]
1488 movzx eax, byte ptr [rbp+0x40]
1489 or eax, r13d
1490 xor edx, edx
14912:
1492 mov r14d, eax
1493 or eax, r12d
1494 add rdx, 64
1495 cmp rdx, r15
1496 cmovne eax, r14d
1497 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1498 movaps xmm10, xmm2
1499 movups xmm4, xmmword ptr [r8+rdx-0x40]
1500 movups xmm5, xmmword ptr [r8+rdx-0x30]
1501 movaps xmm3, xmm4
1502 shufps xmm4, xmm5, 136
1503 shufps xmm3, xmm5, 221
1504 movaps xmm5, xmm3
1505 movups xmm6, xmmword ptr [r8+rdx-0x20]
1506 movups xmm7, xmmword ptr [r8+rdx-0x10]
1507 movaps xmm3, xmm6
1508 shufps xmm6, xmm7, 136
1509 pshufd xmm6, xmm6, 0x93
1510 shufps xmm3, xmm7, 221
1511 pshufd xmm7, xmm3, 0x93
1512 movups xmm12, xmmword ptr [r9+rdx-0x40]
1513 movups xmm13, xmmword ptr [r9+rdx-0x30]
1514 movaps xmm11, xmm12
1515 shufps xmm12, xmm13, 136
1516 shufps xmm11, xmm13, 221
1517 movaps xmm13, xmm11
1518 movups xmm14, xmmword ptr [r9+rdx-0x20]
1519 movups xmm15, xmmword ptr [r9+rdx-0x10]
1520 movaps xmm11, xmm14
1521 shufps xmm14, xmm15, 136
1522 pshufd xmm14, xmm14, 0x93
1523 shufps xmm11, xmm15, 221
1524 pshufd xmm15, xmm11, 0x93
1525 movaps xmm3, xmmword ptr [rsp]
1526 movaps xmm11, xmmword ptr [rsp+0x10]
1527 pinsrd xmm3, eax, 3
1528 pinsrd xmm11, eax, 3
1529 mov al, 7
15309:
1531 paddd xmm0, xmm4
1532 paddd xmm8, xmm12
1533 movaps xmmword ptr [rsp+0x20], xmm4
1534 movaps xmmword ptr [rsp+0x30], xmm12
1535 paddd xmm0, xmm1
1536 paddd xmm8, xmm9
1537 pxor xmm3, xmm0
1538 pxor xmm11, xmm8
1539 movaps xmm12, xmmword ptr [ROT16+rip]
1540 pshufb xmm3, xmm12
1541 pshufb xmm11, xmm12
1542 paddd xmm2, xmm3
1543 paddd xmm10, xmm11
1544 pxor xmm1, xmm2
1545 pxor xmm9, xmm10
1546 movdqa xmm4, xmm1
1547 pslld xmm1, 20
1548 psrld xmm4, 12
1549 por xmm1, xmm4
1550 movdqa xmm4, xmm9
1551 pslld xmm9, 20
1552 psrld xmm4, 12
1553 por xmm9, xmm4
1554 paddd xmm0, xmm5
1555 paddd xmm8, xmm13
1556 movaps xmmword ptr [rsp+0x40], xmm5
1557 movaps xmmword ptr [rsp+0x50], xmm13
1558 paddd xmm0, xmm1
1559 paddd xmm8, xmm9
1560 pxor xmm3, xmm0
1561 pxor xmm11, xmm8
1562 movaps xmm13, xmmword ptr [ROT8+rip]
1563 pshufb xmm3, xmm13
1564 pshufb xmm11, xmm13
1565 paddd xmm2, xmm3
1566 paddd xmm10, xmm11
1567 pxor xmm1, xmm2
1568 pxor xmm9, xmm10
1569 movdqa xmm4, xmm1
1570 pslld xmm1, 25
1571 psrld xmm4, 7
1572 por xmm1, xmm4
1573 movdqa xmm4, xmm9
1574 pslld xmm9, 25
1575 psrld xmm4, 7
1576 por xmm9, xmm4
1577 pshufd xmm0, xmm0, 0x93
1578 pshufd xmm8, xmm8, 0x93
1579 pshufd xmm3, xmm3, 0x4E
1580 pshufd xmm11, xmm11, 0x4E
1581 pshufd xmm2, xmm2, 0x39
1582 pshufd xmm10, xmm10, 0x39
1583 paddd xmm0, xmm6
1584 paddd xmm8, xmm14
1585 paddd xmm0, xmm1
1586 paddd xmm8, xmm9
1587 pxor xmm3, xmm0
1588 pxor xmm11, xmm8
1589 pshufb xmm3, xmm12
1590 pshufb xmm11, xmm12
1591 paddd xmm2, xmm3
1592 paddd xmm10, xmm11
1593 pxor xmm1, xmm2
1594 pxor xmm9, xmm10
1595 movdqa xmm4, xmm1
1596 pslld xmm1, 20
1597 psrld xmm4, 12
1598 por xmm1, xmm4
1599 movdqa xmm4, xmm9
1600 pslld xmm9, 20
1601 psrld xmm4, 12
1602 por xmm9, xmm4
1603 paddd xmm0, xmm7
1604 paddd xmm8, xmm15
1605 paddd xmm0, xmm1
1606 paddd xmm8, xmm9
1607 pxor xmm3, xmm0
1608 pxor xmm11, xmm8
1609 pshufb xmm3, xmm13
1610 pshufb xmm11, xmm13
1611 paddd xmm2, xmm3
1612 paddd xmm10, xmm11
1613 pxor xmm1, xmm2
1614 pxor xmm9, xmm10
1615 movdqa xmm4, xmm1
1616 pslld xmm1, 25
1617 psrld xmm4, 7
1618 por xmm1, xmm4
1619 movdqa xmm4, xmm9
1620 pslld xmm9, 25
1621 psrld xmm4, 7
1622 por xmm9, xmm4
1623 pshufd xmm0, xmm0, 0x39
1624 pshufd xmm8, xmm8, 0x39
1625 pshufd xmm3, xmm3, 0x4E
1626 pshufd xmm11, xmm11, 0x4E
1627 pshufd xmm2, xmm2, 0x93
1628 pshufd xmm10, xmm10, 0x93
1629 dec al
1630 je 9f
1631 movdqa xmm12, xmmword ptr [rsp+0x20]
1632 movdqa xmm5, xmmword ptr [rsp+0x40]
1633 pshufd xmm13, xmm12, 0x0F
1634 shufps xmm12, xmm5, 214
1635 pshufd xmm4, xmm12, 0x39
1636 movdqa xmm12, xmm6
1637 shufps xmm12, xmm7, 250
1638 pblendw xmm13, xmm12, 0xCC
1639 movdqa xmm12, xmm7
1640 punpcklqdq xmm12, xmm5
1641 pblendw xmm12, xmm6, 0xC0
1642 pshufd xmm12, xmm12, 0x78
1643 punpckhdq xmm5, xmm7
1644 punpckldq xmm6, xmm5
1645 pshufd xmm7, xmm6, 0x1E
1646 movdqa xmmword ptr [rsp+0x20], xmm13
1647 movdqa xmmword ptr [rsp+0x40], xmm12
1648 movdqa xmm5, xmmword ptr [rsp+0x30]
1649 movdqa xmm13, xmmword ptr [rsp+0x50]
1650 pshufd xmm6, xmm5, 0x0F
1651 shufps xmm5, xmm13, 214
1652 pshufd xmm12, xmm5, 0x39
1653 movdqa xmm5, xmm14
1654 shufps xmm5, xmm15, 250
1655 pblendw xmm6, xmm5, 0xCC
1656 movdqa xmm5, xmm15
1657 punpcklqdq xmm5, xmm13
1658 pblendw xmm5, xmm14, 0xC0
1659 pshufd xmm5, xmm5, 0x78
1660 punpckhdq xmm13, xmm15
1661 punpckldq xmm14, xmm13
1662 pshufd xmm15, xmm14, 0x1E
1663 movdqa xmm13, xmm6
1664 movdqa xmm14, xmm5
1665 movdqa xmm5, xmmword ptr [rsp+0x20]
1666 movdqa xmm6, xmmword ptr [rsp+0x40]
1667 jmp 9b
16689:
1669 pxor xmm0, xmm2
1670 pxor xmm1, xmm3
1671 pxor xmm8, xmm10
1672 pxor xmm9, xmm11
1673 mov eax, r13d
1674 cmp rdx, r15
1675 jne 2b
1676 movups xmmword ptr [rbx], xmm0
1677 movups xmmword ptr [rbx+0x10], xmm1
1678 movups xmmword ptr [rbx+0x20], xmm8
1679 movups xmmword ptr [rbx+0x30], xmm9
1680 movdqa xmm0, xmmword ptr [rsp+0x130]
1681 movdqa xmm1, xmmword ptr [rsp+0x110]
1682 movdqa xmm2, xmmword ptr [rsp+0x120]
1683 movdqu xmm3, xmmword ptr [rsp+0x118]
1684 movdqu xmm4, xmmword ptr [rsp+0x128]
1685 blendvps xmm1, xmm3, xmm0
1686 blendvps xmm2, xmm4, xmm0
1687 movdqa xmmword ptr [rsp+0x110], xmm1
1688 movdqa xmmword ptr [rsp+0x120], xmm2
1689 add rdi, 16
1690 add rbx, 64
1691 sub rsi, 2
16923:
1693 test esi, 0x1
1694 je 4b
1695 movups xmm0, xmmword ptr [rcx]
1696 movups xmm1, xmmword ptr [rcx+0x10]
1697 movd xmm13, dword ptr [rsp+0x110]
1698 pinsrd xmm13, dword ptr [rsp+0x120], 1
1699 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1700 movaps xmm14, xmmword ptr [ROT8+rip]
1701 movaps xmm15, xmmword ptr [ROT16+rip]
1702 mov r8, qword ptr [rdi]
1703 movzx eax, byte ptr [rbp+0x40]
1704 or eax, r13d
1705 xor edx, edx
17062:
1707 mov r14d, eax
1708 or eax, r12d
1709 add rdx, 64
1710 cmp rdx, r15
1711 cmovne eax, r14d
1712 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1713 movaps xmm3, xmm13
1714 pinsrd xmm3, eax, 3
1715 movups xmm4, xmmword ptr [r8+rdx-0x40]
1716 movups xmm5, xmmword ptr [r8+rdx-0x30]
1717 movaps xmm8, xmm4
1718 shufps xmm4, xmm5, 136
1719 shufps xmm8, xmm5, 221
1720 movaps xmm5, xmm8
1721 movups xmm6, xmmword ptr [r8+rdx-0x20]
1722 movups xmm7, xmmword ptr [r8+rdx-0x10]
1723 movaps xmm8, xmm6
1724 shufps xmm6, xmm7, 136
1725 pshufd xmm6, xmm6, 0x93
1726 shufps xmm8, xmm7, 221
1727 pshufd xmm7, xmm8, 0x93
1728 mov al, 7
17299:
1730 paddd xmm0, xmm4
1731 paddd xmm0, xmm1
1732 pxor xmm3, xmm0
1733 pshufb xmm3, xmm15
1734 paddd xmm2, xmm3
1735 pxor xmm1, xmm2
1736 movdqa xmm11, xmm1
1737 pslld xmm1, 20
1738 psrld xmm11, 12
1739 por xmm1, xmm11
1740 paddd xmm0, xmm5
1741 paddd xmm0, xmm1
1742 pxor xmm3, xmm0
1743 pshufb xmm3, xmm14
1744 paddd xmm2, xmm3
1745 pxor xmm1, xmm2
1746 movdqa xmm11, xmm1
1747 pslld xmm1, 25
1748 psrld xmm11, 7
1749 por xmm1, xmm11
1750 pshufd xmm0, xmm0, 0x93
1751 pshufd xmm3, xmm3, 0x4E
1752 pshufd xmm2, xmm2, 0x39
1753 paddd xmm0, xmm6
1754 paddd xmm0, xmm1
1755 pxor xmm3, xmm0
1756 pshufb xmm3, xmm15
1757 paddd xmm2, xmm3
1758 pxor xmm1, xmm2
1759 movdqa xmm11, xmm1
1760 pslld xmm1, 20
1761 psrld xmm11, 12
1762 por xmm1, xmm11
1763 paddd xmm0, xmm7
1764 paddd xmm0, xmm1
1765 pxor xmm3, xmm0
1766 pshufb xmm3, xmm14
1767 paddd xmm2, xmm3
1768 pxor xmm1, xmm2
1769 movdqa xmm11, xmm1
1770 pslld xmm1, 25
1771 psrld xmm11, 7
1772 por xmm1, xmm11
1773 pshufd xmm0, xmm0, 0x39
1774 pshufd xmm3, xmm3, 0x4E
1775 pshufd xmm2, xmm2, 0x93
1776 dec al
1777 jz 9f
1778 movdqa xmm8, xmm4
1779 shufps xmm8, xmm5, 214
1780 pshufd xmm9, xmm4, 0x0F
1781 pshufd xmm4, xmm8, 0x39
1782 movdqa xmm8, xmm6
1783 shufps xmm8, xmm7, 250
1784 pblendw xmm9, xmm8, 0xCC
1785 movdqa xmm8, xmm7
1786 punpcklqdq xmm8, xmm5
1787 pblendw xmm8, xmm6, 0xC0
1788 pshufd xmm8, xmm8, 0x78
1789 punpckhdq xmm5, xmm7
1790 punpckldq xmm6, xmm5
1791 pshufd xmm7, xmm6, 0x1E
1792 movdqa xmm5, xmm9
1793 movdqa xmm6, xmm8
1794 jmp 9b
17959:
1796 pxor xmm0, xmm2
1797 pxor xmm1, xmm3
1798 mov eax, r13d
1799 cmp rdx, r15
1800 jne 2b
1801 movups xmmword ptr [rbx], xmm0
1802 movups xmmword ptr [rbx+0x10], xmm1
1803 jmp 4b
1804
1805.p2align 6
/*
 * blake3_compress_in_place_sse41 / _blake3_compress_in_place_sse41
 *
 * Runs the 7-round compression loop over one 64-byte block and writes the
 * 32-byte result back over the chaining value it was given.
 *
 * ABI:   System V AMD64, leaf function, no stack usage.
 * In:    rdi = chaining value, 32 bytes (read on entry, overwritten on exit)
 *        rsi = message block, 64 bytes (read with unaligned loads)
 *        dl  = byte placed in lane 2 of state row 3 (the lane that
 *              blake3_hash_many_sse41 fills from BLAKE3_BLOCK_LEN)
 *        rcx = 64-bit value placed in lanes 0-1 of state row 3 (counter)
 *        r8b = byte placed in lane 3 of state row 3 (flags)
 * Out:   [rdi]      = row0 ^ row2
 *        [rdi+0x10] = row1 ^ row3
 * Clobb: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags
 *
 * NOTE(review): the C prototype is not visible in this file; dl and r8b are
 * treated as bytes to match blake3_compress_xof_sse41 below.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        _CET_ENDBR
        /* State rows 0-1 come from the chaining value, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Build row 3 = { rcx lo, rcx hi, dl, r8b }.
         * Both byte arguments are zero-extended first: the bits above a
         * narrow integer argument are not guaranteed to be clear, and any
         * stale bits would otherwise leak into lanes 2 and 3.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and split them into even-indexed words
         * (shufps 136) and odd-indexed words (shufps 221):
         *   xmm4 = words 0,2,4,6     xmm5 = words 1,3,5,7
         *   xmm6 / xmm7 = even / odd words of the upper half, lanes rotated
         *   by one (pshufd 0x93) to line up with the diagonal half-round.
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        /* Byte-shuffle masks used for the rotations by 8 and by 16. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                           /* round counter */
9:
        /* ---- column half-round, first message vector ---- */
        paddd   xmm0, xmm4                      /* row0 += m            */
        paddd   xmm0, xmm1                      /* row0 += row1         */
        pxor    xmm3, xmm0                      /* row3 ^= row0         */
        pshufb  xmm3, xmm15                     /* row3 = rotr(row3,16) */
        paddd   xmm2, xmm3                      /* row2 += row3         */
        pxor    xmm1, xmm2                      /* row1 ^= row2         */
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* row1 = rotr(row1,12) */
        /* ---- column half-round, second message vector ---- */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14                     /* row3 = rotr(row3,8)  */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* row1 = rotr(row1,7)  */
        /* Rotate the lanes of rows 0, 3 and 2 so columns become diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* ---- diagonal half-round, third message vector ---- */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* ---- diagonal half-round, fourth message vector ---- */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation applied before the diagonal half-round. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                              /* all 7 rounds done */
        /*
         * Permute the message words in xmm4-xmm7 for the next round.
         * xmm8 and xmm9 are scratch; the order below matters because xmm5
         * and xmm6 are consumed and rewritten along the way.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower and upper state halves and store the result. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1905
1906.p2align 6
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Same 7-round compression loop as blake3_compress_in_place_sse41, but the
 * input chaining value is left untouched and a 64-byte result is stored
 * through r9.
 *
 * ABI:   System V AMD64, leaf function, no stack usage.
 * In:    rdi = chaining value, 32 bytes (read only; read again at the end)
 *        rsi = message block, 64 bytes (unaligned loads)
 *        dl  = byte for lane 2 of state row 3 (zero-extended here)
 *        rcx = 64-bit value for lanes 0-1 of state row 3
 *        r8b = byte for lane 3 of state row 3 (zero-extended here)
 *        r9  = output buffer, 64 bytes (unaligned stores)
 * Out:   [r9+0x00] = row0 ^ row2        [r9+0x20] = row2 ^ cv[0..15]
 *        [r9+0x10] = row1 ^ row3        [r9+0x30] = row3 ^ cv[16..31]
 * Clobb: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags
 */
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
        _CET_ENDBR

        /* Rows 0-1 <- chaining value, row 2 <- IV, plus the rotate masks. */
        movups      xmm0,  xmmword ptr [rdi]
        movups      xmm1,  xmmword ptr [rdi+0x10]
        movaps      xmm2,  xmmword ptr [BLAKE3_IV+rip]
        movaps      xmm14, xmmword ptr [ROT8+rip]
        movaps      xmm15, xmmword ptr [ROT16+rip]

        /* Row 3 = { rcx lo, rcx hi, zext(dl), zext(r8b) }. */
        movzx       edx, dl
        movzx       eax, r8b
        shl         rax, 32
        add         rdx, rax
        movq        xmm4, rdx
        movq        xmm3, rcx
        punpcklqdq  xmm3, xmm4

        /* Lower 8 message words: xmm4 = even-indexed, xmm5 = odd-indexed. */
        movups      xmm4, xmmword ptr [rsi]
        movups      xmm5, xmmword ptr [rsi+0x10]
        movaps      xmm8, xmm4
        shufps      xmm4, xmm5, 0x88
        shufps      xmm8, xmm5, 0xDD
        movaps      xmm5, xmm8

        /* Upper 8 message words, same split, then lanes rotated by one. */
        movups      xmm6, xmmword ptr [rsi+0x20]
        movups      xmm7, xmmword ptr [rsi+0x30]
        movaps      xmm8, xmm6
        shufps      xmm6, xmm7, 0x88
        pshufd      xmm6, xmm6, 0x93
        shufps      xmm8, xmm7, 0xDD
        pshufd      xmm7, xmm8, 0x93

        mov         al, 7                   /* rounds remaining */
9:
        /* Columns, step 1: add message + row1, rotate row3 by 16, row1 by 12. */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm4
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm15
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 12
        pslld       xmm1, 20
        por         xmm1, xmm11

        /* Columns, step 2: add message + row1, rotate row3 by 8, row1 by 7. */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm5
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm14
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 7
        pslld       xmm1, 25
        por         xmm1, xmm11

        /* Shift rows 0, 3, 2 lane-wise so the next steps hit the diagonals. */
        pshufd      xmm0, xmm0, 0x93
        pshufd      xmm3, xmm3, 0x4E
        pshufd      xmm2, xmm2, 0x39

        /* Diagonals, step 1. */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm6
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm15
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 12
        pslld       xmm1, 20
        por         xmm1, xmm11

        /* Diagonals, step 2. */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm7
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm14
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 7
        pslld       xmm1, 25
        por         xmm1, xmm11

        /* Shift the rows back into column order. */
        pshufd      xmm0, xmm0, 0x39
        pshufd      xmm3, xmm3, 0x4E
        pshufd      xmm2, xmm2, 0x93

        dec         al
        je          9f                      /* leave after the 7th round */

        /* Message-word permutation for the next round (xmm8/xmm9 scratch). */
        pshufd      xmm9, xmm4, 0x0F
        movdqa      xmm8, xmm4
        shufps      xmm8, xmm5, 0xD6
        pshufd      xmm4, xmm8, 0x39
        movdqa      xmm8, xmm6
        shufps      xmm8, xmm7, 0xFA
        pblendw     xmm9, xmm8, 0xCC
        movdqa      xmm8, xmm7
        punpcklqdq  xmm8, xmm5
        pblendw     xmm8, xmm6, 0xC0
        pshufd      xmm8, xmm8, 0x78
        punpckhdq   xmm5, xmm7
        punpckldq   xmm6, xmm5
        pshufd      xmm7, xmm6, 0x1E
        movdqa      xmm5, xmm9
        movdqa      xmm6, xmm8
        jmp         9b
9:
        /* First 32 output bytes: lower state half ^ upper state half. */
        pxor        xmm0, xmm2
        pxor        xmm1, xmm3
        /* Last 32 output bytes: upper state half ^ input chaining value.
           The chaining value is reloaded before anything is stored to r9. */
        movdqu      xmm4, xmmword ptr [rdi]
        movdqu      xmm5, xmmword ptr [rdi+0x10]
        pxor        xmm2, xmm4
        pxor        xmm3, xmm5
        movups      xmmword ptr [r9],      xmm0
        movups      xmmword ptr [r9+0x10], xmm1
        movups      xmmword ptr [r9+0x20], xmm2
        movups      xmmword ptr [r9+0x30], xmm3
        ret
2014
2015
/*
 * Read-only constants for the SSE4.1 kernels.
 *
 * Every table is exactly 16 bytes and the group starts on a 64-byte
 * boundary, so each label is 16-byte aligned as the aligned loads
 * (movaps / movdqa) and memory-operand SSE instructions in this file need.
 * Keep every entry at 16 bytes when editing.
 */
#if defined(__APPLE__)
.static_data
#else
.section .rodata
#endif
.p2align 6

/* Four IV words, loaded as one vector into state row 2. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* pshufb mask: rotate every 32-bit lane right by 16 bits. */
ROT16:
        .byte    2,  3,  0,  1
        .byte    6,  7,  4,  5
        .byte   10, 11,  8,  9
        .byte   14, 15, 12, 13

/* pshufb mask: rotate every 32-bit lane right by 8 bits. */
ROT8:
        .byte    1,  2,  3,  0
        .byte    5,  6,  7,  4
        .byte    9, 10, 11,  8
        .byte   13, 14, 15, 12

/* Per-lane counter offsets and the per-iteration counter stride used by
   blake3_hash_many_sse41 for its four parallel inputs. */
ADD0:
        .long   0, 1, 2, 3
ADD1:
        .fill   4, 4, 4

/* Each IV word broadcast to all four 32-bit lanes. */
BLAKE3_IV_0:
        .fill   4, 4, 0x6A09E667
BLAKE3_IV_1:
        .fill   4, 4, 0xBB67AE85
BLAKE3_IV_2:
        .fill   4, 4, 0x3C6EF372
BLAKE3_IV_3:
        .fill   4, 4, 0xA54FF53A

/* Block length (64) in every lane. */
BLAKE3_BLOCK_LEN:
        .fill   4, 4, 64

/* Sign-bit mask: XORed into both operands so the signed pcmpgtd behaves
   as an unsigned compare when detecting counter carry. */
CMP_MSB_MASK:
        .fill   4, 4, 0x80000000
2045
2046#endif
2047