aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matthew Krupcale <[email protected]> 2020-08-23 09:46:56 -0400
committer Matthew Krupcale <[email protected]> 2020-08-24 00:57:39 -0400
commit 769c7cdc9663c596aa4a7604ad495b4481a93201 (patch)
tree cfa110823fbd01572edc19d543162c2f64caa8ee
parent 1ef915dbea5038811ce0a8e2effd138b13afbc2a (diff)
C: asm: emulate pshufb ROT16 using SSE2 instructions
Use two 16-bit shuffles: one for the low 64 bits and one for the high 64 bits.

* c/blake3_sse2_x86-64_unix.S: emulate pshufb using SSE2 instructions for x86_64 unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 218
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 218
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 219
3 files changed, 396 insertions, 259 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index e033793..a3f177a 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -192,11 +192,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -278,11 +281,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -364,11 +370,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -450,11 +459,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -536,11 +548,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -622,11 +637,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -708,11 +726,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -794,11 +815,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -880,11 +904,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -966,11 +993,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1052,11 +1082,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1138,11 +1171,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1224,11 +1260,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1310,11 +1349,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1536,9 +1578,10 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16+rip]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1586,8 +1629,10 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1740,7 +1785,6 @@ blake3_hash_many_sse2:
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
@@ -1775,7 +1819,8 @@ blake3_hash_many_sse2:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1798,7 +1843,8 @@ blake3_hash_many_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1883,13 +1929,13 @@ _blake3_compress_in_place_sse2:
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1912,7 +1958,8 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1996,13 +2043,13 @@ _blake3_compress_xof_sse2:
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2025,7 +2072,8 @@ _blake3_compress_xof_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2097,8 +2145,6 @@ _blake3_compress_xof_sse2:
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
-ROT16:
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 82e27ad..f45d1c6 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -191,11 +191,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -277,11 +280,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -363,11 +369,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -449,11 +458,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -535,11 +547,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -621,11 +636,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -707,11 +725,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -793,11 +814,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -879,11 +903,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -965,11 +992,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1051,11 +1081,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1137,11 +1170,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1223,11 +1259,14 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1309,11 +1348,14 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1547,9 +1589,10 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16+rip]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1597,8 +1640,10 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1751,7 +1796,6 @@ blake3_hash_many_sse2:
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
@@ -1786,7 +1830,8 @@ blake3_hash_many_sse2:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1809,7 +1854,8 @@ blake3_hash_many_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1903,13 +1949,13 @@ _blake3_compress_in_place_sse2:
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1932,7 +1978,8 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2033,13 +2080,13 @@ blake3_compress_xof_sse2:
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
movaps xmm14, xmmword ptr [ROT8+rip]
- movaps xmm15, xmmword ptr [ROT16+rip]
mov al, 7
9:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2062,7 +2109,8 @@ blake3_compress_xof_sse2:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2138,8 +2186,6 @@ blake3_compress_xof_sse2:
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
-ROT16:
- .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
ROT8:
.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 2d3900f..4ab5b11 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -192,11 +192,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -278,11 +281,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -364,11 +370,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -450,11 +459,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -536,11 +548,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -622,11 +637,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -708,11 +726,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -794,11 +815,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -880,11 +904,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -966,11 +993,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1052,11 +1082,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1138,11 +1171,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1224,11 +1260,14 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ pshuflw xmm12, xmm12, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm12, xmm12, 0B1H ; same for the two high dwords
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
+ pshuflw xmm15, xmm15, 0B1H
+ pshufhw xmm15, xmm15, 0B1H
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1310,11 +1349,14 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT16]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ pshuflw xmm15, xmm15, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm15, xmm15, 0B1H ; same for the two high dwords
+ pshuflw xmm12, xmm12, 0B1H
+ pshufhw xmm12, xmm12, 0B1H
+ pshuflw xmm13, xmm13, 0B1H
+ pshufhw xmm13, xmm13, 0B1H
+ pshuflw xmm14, xmm14, 0B1H
+ pshufhw xmm14, xmm14, 0B1H
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1548,9 +1590,10 @@ roundloop2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm12, xmmword ptr [ROT16]
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
+ pshuflw xmm11, xmm11, 0B1H
+ pshufhw xmm11, xmm11, 0B1H
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1598,8 +1641,10 @@ roundloop2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm12
- pshufb xmm11, xmm12
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
+ pshuflw xmm11, xmm11, 0B1H
+ pshufhw xmm11, xmm11, 0B1H
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1752,7 +1797,6 @@ final1block:
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
pinsrw xmm13, eax, 5
movaps xmm14, xmmword ptr [ROT8]
- movaps xmm15, xmmword ptr [ROT16]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
@@ -1787,7 +1831,8 @@ roundloop1:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1810,7 +1855,8 @@ roundloop1:
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1905,13 +1951,13 @@ _blake3_compress_in_place_sse2 PROC
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
movaps xmm14, xmmword ptr [ROT8]
- movaps xmm15, xmmword ptr [ROT16]
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1934,7 +1980,8 @@ _blake3_compress_in_place_sse2 PROC
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2036,13 +2083,13 @@ _blake3_compress_xof_sse2 PROC
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
movaps xmm14, xmmword ptr [ROT8]
- movaps xmm15, xmmword ptr [ROT16]
mov al, 7
@@:
paddd xmm0, xmm4
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2065,7 +2112,8 @@ _blake3_compress_xof_sse2 PROC
paddd xmm0, xmm6
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm15
+ pshuflw xmm3, xmm3, 0B1H ; swap the 16-bit halves of each low dword (32-bit rotate by 16)
+ pshufhw xmm3, xmm3, 0B1H ; same for the two high dwords
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2166,9 +2214,6 @@ BLAKE3_IV_3:
BLAKE3_BLOCK_LEN:
dd 4 dup (64)
-ROT16:
- db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-
ROT8:
db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12