diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-23 09:46:56 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:57:39 -0400 |
| commit | 769c7cdc9663c596aa4a7604ad495b4481a93201 (patch) | |
| tree | cfa110823fbd01572edc19d543162c2f64caa8ee | |
| parent | 1ef915dbea5038811ce0a8e2effd138b13afbc2a (diff) | |
C: asm: emulate pshufb ROT16 using SSE2 instructions
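Use two 16-bit word shuffles (pshuflw and pshufhw with immediate 0xB1): one for the low 64 bits and one for the high 64 bits. Swapping the two 16-bit halves of every 32-bit lane is equivalent to rotating each lane by 16 bits, so the ROT16 pshufb mask is no longer needed.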
* c/blake3_sse2_x86-64_unix.S: Emulate pshufb using SSE2 instructions for x86_64 Unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 218 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 218 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 219 |
3 files changed, 396 insertions, 259 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S index e033793..a3f177a 100644 --- a/c/blake3_sse2_x86-64_unix.S +++ b/c/blake3_sse2_x86-64_unix.S @@ -192,11 +192,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -278,11 +281,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -364,11 +370,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -450,11 +459,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + 
pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -536,11 +548,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -622,11 +637,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -708,11 +726,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -794,11 +815,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb 
xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -880,11 +904,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -966,11 +993,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1052,11 +1082,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1138,11 +1171,14 @@ 
blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1224,11 +1260,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1310,11 +1349,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1536,9 +1578,10 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1586,8 +1629,10 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - 
pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1740,7 +1785,6 @@ blake3_hash_many_sse2: pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 pinsrw xmm13, eax, 5 movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d @@ -1775,7 +1819,8 @@ blake3_hash_many_sse2: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1798,7 +1843,8 @@ blake3_hash_many_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1883,13 +1929,13 @@ _blake3_compress_in_place_sse2: shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1912,7 +1958,8 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1996,13 +2043,13 @@ _blake3_compress_xof_sse2: shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2025,7 +2072,8 @@ _blake3_compress_xof_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + 
pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2097,8 +2145,6 @@ _blake3_compress_xof_sse2: BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S index 82e27ad..f45d1c6 100644 --- a/c/blake3_sse2_x86-64_windows_gnu.S +++ b/c/blake3_sse2_x86-64_windows_gnu.S @@ -191,11 +191,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -277,11 +280,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -363,11 +369,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, 
xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -449,11 +458,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -535,11 +547,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -621,11 +636,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -707,11 +725,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + 
pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -793,11 +814,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -879,11 +903,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -965,11 +992,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1051,11 +1081,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 
- pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1137,11 +1170,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1223,11 +1259,14 @@ blake3_hash_many_sse2: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1309,11 +1348,14 @@ blake3_hash_many_sse2: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16+rip] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] @@ -1547,9 +1589,10 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor 
xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16+rip] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1597,8 +1640,10 @@ blake3_hash_many_sse2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1751,7 +1796,6 @@ blake3_hash_many_sse2: pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 pinsrw xmm13, eax, 5 movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x80] or eax, r13d @@ -1786,7 +1830,8 @@ blake3_hash_many_sse2: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1809,7 +1854,8 @@ blake3_hash_many_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1903,13 +1949,13 @@ _blake3_compress_in_place_sse2: shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1932,7 +1978,8 @@ _blake3_compress_in_place_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2033,13 +2080,13 @@ blake3_compress_xof_sse2: shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, 
xmmword ptr [ROT8+rip] - movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2062,7 +2109,8 @@ blake3_compress_xof_sse2: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2138,8 +2186,6 @@ blake3_compress_xof_sse2: BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A -ROT16: - .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 ADD0: diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm index 2d3900f..4ab5b11 100644 --- a/c/blake3_sse2_x86-64_windows_msvc.asm +++ b/c/blake3_sse2_x86-64_windows_msvc.asm @@ -192,11 +192,14 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [BLAKE3_IV_0] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -278,11 +281,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -364,11 +370,14 @@ innerloop4: pxor xmm13, xmm1 
pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -450,11 +459,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -536,11 +548,14 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -622,11 +637,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -708,11 +726,14 @@ innerloop4: pxor xmm13, 
xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -794,11 +815,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -880,11 +904,14 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -966,11 +993,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1052,11 +1082,14 @@ innerloop4: pxor 
xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1138,11 +1171,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1224,11 +1260,14 @@ innerloop4: pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 - pshufb xmm15, xmm8 + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H movdqa xmm8, xmmword ptr [rsp+100H] paddd xmm8, xmm12 paddd xmm9, xmm13 @@ -1310,11 +1349,14 @@ innerloop4: pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 - movdqa xmm8, xmmword ptr [ROT16] - pshufb xmm15, xmm8 - pshufb xmm12, xmm8 - pshufb xmm13, xmm8 - pshufb xmm14, xmm8 + pshuflw xmm15, xmm15, B1H + pshufhw xmm15, xmm15, B1H + pshuflw xmm12, xmm12, B1H + pshufhw xmm12, xmm12, B1H + pshuflw xmm13, xmm13, B1H + pshufhw xmm13, xmm13, B1H + pshuflw xmm14, xmm14, B1H + pshufhw xmm14, xmm14, B1H paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+100H] @@ -1548,9 +1590,10 @@ 
roundloop2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - movaps xmm12, xmmword ptr [ROT16] - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H + pshuflw xmm11, xmm11, B1H + pshufhw xmm11, xmm11, B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1598,8 +1641,10 @@ roundloop2: paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 - pshufb xmm3, xmm12 - pshufb xmm11, xmm12 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H + pshuflw xmm11, xmm11, B1H + pshufhw xmm11, xmm11, B1H paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 @@ -1752,7 +1797,6 @@ final1block: pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 pinsrw xmm13, eax, 5 movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword ptr [ROT16] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+80H] or eax, r13d @@ -1787,7 +1831,8 @@ roundloop1: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1810,7 +1855,8 @@ roundloop1: paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1905,13 +1951,13 @@ _blake3_compress_in_place_sse2 PROC shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -1934,7 +1980,8 @@ _blake3_compress_in_place_sse2 PROC paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2036,13 +2083,13 @@ _blake3_compress_xof_sse2 PROC shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 93H movaps xmm14, xmmword ptr [ROT8] - movaps xmm15, xmmword 
ptr [ROT16] mov al, 7 @@: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2065,7 +2112,8 @@ _blake3_compress_xof_sse2 PROC paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 - pshufb xmm3, xmm15 + pshuflw xmm3, xmm3, B1H + pshufhw xmm3, xmm3, B1H paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 @@ -2166,9 +2214,6 @@ BLAKE3_IV_3: BLAKE3_BLOCK_LEN: dd 4 dup (64) -ROT16: - db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 - ROT8: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 |
