| author | Matthew Krupcale <[email protected]> | 2020-08-23 17:48:09 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:57:39 -0400 |
| commit | e4681ec39e0780787cb6a003bab0c0f2bc6ef9be (patch) | |
| tree | f43066f670fbbe54e761c315b3591830515ae131 | |
| parent | 769c7cdc9663c596aa4a7604ad495b4481a93201 (diff) | |
C: asm: emulate pshufb ROT8 using SSE2 instructions
Use a pair of shifts combined with an XOR for the rotation, since pshufb requires SSSE3 and is unavailable in a pure SSE2 build (see the sketch below the file list).
* c/blake3_sse2_x86-64_unix.S: Emulate pshufb using SSE2 instructions for x86_64 Unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
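Each removed `pshufb x, [ROT8]` (a byte shuffle that rotates every 32-bit lane right by 8 bits) is replaced by four SSE2 instructions: copy the register, shift the copy left by 24, shift the original right by 8, and XOR the two halves back together. A minimal C-intrinsics sketch of the same rotation, for illustration only (the function name is hypothetical and not part of this patch; the patch itself is pure assembly):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */

/* Rotate each 32-bit lane of x right by 8 bits using only SSE2,
 * mirroring the movdqa/psrld/pslld/pxor sequence in the diff. */
static inline __m128i rot8_sse2(__m128i x) {
    __m128i hi = _mm_slli_epi32(x, 24); /* wrapped-around bits (pslld xmm8, 24) */
    __m128i lo = _mm_srli_epi32(x, 8);  /* main shifted part   (psrld xmmN, 8)  */
    return _mm_xor_si128(lo, hi);       /* combine the halves  (pxor)           */
}
```

Because the two shifted parts occupy disjoint bit positions, XOR and OR are equivalent here; the assembly uses `pxor`, so the sketch does too.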
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 350 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 350 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 351 |
3 files changed, 792 insertions(+), 259 deletions(-)
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S index a3f177a..a72d40b 100644 --- a/c/blake3_sse2_x86-64_unix.S +++ b/c/blake3_sse2_x86-64_unix.S @@ -238,11 +238,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -327,11 +338,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -416,11 +438,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -505,11 +538,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -594,11 +638,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -683,11 +738,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + 
movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -772,11 +838,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -861,11 +938,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -950,11 +1038,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1039,11 +1138,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1128,11 +1238,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1217,11 +1338,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 
+ psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1306,11 +1438,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1395,11 +1538,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1602,9 +1756,14 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1651,8 +1810,14 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1784,7 +1949,6 @@ blake3_hash_many_sse2: sar eax, 16 pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 pinsrw xmm13, eax, 5 - movaps xmm14, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d @@ -1830,7 +1994,10 @@ blake3_hash_many_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1854,7 +2021,10 @@ blake3_hash_many_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1928,7 +2098,6 @@ _blake3_compress_in_place_sse2: pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] mov al, 7 9: paddd xmm0, xmm4 @@ -1945,7 +2114,10 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1969,7 +2141,10 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2042,7 +2217,6 @@ _blake3_compress_xof_sse2: pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd 
xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] mov al, 7 9: paddd xmm0, xmm4 @@ -2059,7 +2233,10 @@ _blake3_compress_xof_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2083,7 +2260,10 @@ _blake3_compress_xof_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2145,8 +2325,6 @@ _blake3_compress_xof_sse2: BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: .long 0, 1, 2, 3 ADD1: diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S index f45d1c6..04ee6f4 100644 --- a/c/blake3_sse2_x86-64_windows_gnu.S +++ b/c/blake3_sse2_x86-64_windows_gnu.S @@ -237,11 +237,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -326,11 +337,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -415,11 +437,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -504,11 +537,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -593,11 +637,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb 
xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -682,11 +737,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -771,11 +837,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -860,11 +937,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -949,11 +1037,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1038,11 +1137,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1127,11 +1237,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa 
xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1216,11 +1337,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1305,11 +1437,22 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1394,11 +1537,22 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1613,9 +1767,14 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8+rip] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1662,8 +1821,14 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1795,7 +1960,6 @@ blake3_hash_many_sse2: sar eax, 16 pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 pinsrw xmm13, eax, 5 - movaps xmm14, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d @@ -1841,7 +2005,10 @@ blake3_hash_many_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1865,7 +2032,10 @@ blake3_hash_many_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 
movdqa xmm11, xmm1 @@ -1948,7 +2118,6 @@ _blake3_compress_in_place_sse2: pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] mov al, 7 9: paddd xmm0, xmm4 @@ -1965,7 +2134,10 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1989,7 +2161,10 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2079,7 +2254,6 @@ blake3_compress_xof_sse2: pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 - movaps xmm14, xmmword ptr [ROT8+rip] mov al, 7 9: paddd xmm0, xmm4 @@ -2096,7 +2270,10 @@ blake3_compress_xof_sse2: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2120,7 +2297,10 @@ blake3_compress_xof_sse2: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2186,8 +2366,6 @@ blake3_compress_xof_sse2: BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A -ROT8: - .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: .long 0, 1, 2, 3 ADD1: diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm index 4ab5b11..a6a1932 100644 --- a/c/blake3_sse2_x86-64_windows_msvc.asm +++ b/c/blake3_sse2_x86-64_windows_msvc.asm @@ -238,11 +238,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -327,11 +338,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -416,11 +438,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -505,11 +538,22 
@@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -594,11 +638,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -683,11 +738,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -772,11 +838,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -861,11 +938,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -950,11 +1038,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1039,11 +1138,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, 
xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1128,11 +1238,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1217,11 +1338,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1306,11 +1438,22 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1395,11 +1538,22 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT8] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1614,9 +1768,14 @@ roundloop2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm13, xmmword ptr [ROT8] - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1663,8 +1822,14 @@ roundloop2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - pshufb xmm3, xmm13 - pshufb xmm11, xmm13 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1796,7 +1961,6 @@ final1block: sar eax, 16 pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 pinsrw xmm13, eax, 5 - movaps 
xmm14, xmmword ptr [ROT8] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d @@ -1842,7 +2006,10 @@ roundloop1: paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1866,7 +2033,10 @@ roundloop1: paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1950,7 +2120,6 @@ _blake3_compress_in_place_sse2 PROC pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H - movaps xmm14, xmmword ptr [ROT8] mov al, 7 @@: paddd xmm0, xmm4 @@ -1967,7 +2136,10 @@ _blake3_compress_in_place_sse2 PROC paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1991,7 +2163,10 @@ _blake3_compress_in_place_sse2 PROC paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2082,7 +2257,6 @@ _blake3_compress_xof_sse2 PROC pshufd xmm6, xmm6, 93H shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H - movaps xmm14, xmmword ptr [ROT8] mov al, 7 @@: paddd xmm0, xmm4 @@ -2099,7 +2273,10 @@ _blake3_compress_xof_sse2 PROC paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2123,7 +2300,10 @@ _blake3_compress_xof_sse2 PROC paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm14 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2214,9 +2394,6 @@ BLAKE3_IV_3: BLAKE3_BLOCK_LEN: dd 4 dup (64) -ROT8: - db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 - CMP_MSB_MASK: dd 8 dup(80000000H) |
