about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthew Krupcale <[email protected]> 2020-08-23 17:48:09 -0400
committer: Matthew Krupcale <[email protected]> 2020-08-24 00:57:39 -0400
commit: e4681ec39e0780787cb6a003bab0c0f2bc6ef9be (patch)
tree: f43066f670fbbe54e761c315b3591830515ae131
parent: 769c7cdc9663c596aa4a7604ad495b4481a93201 (diff)
C: asm: emulate pshufb ROT8 using SSE2 instructions
Use a simple shift for the rotation.

* c/blake3_sse2_x86-64_unix.S: emulate pshufb using SSE2 instructions for x86_64 unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 350
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 350
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 351
3 files changed, 792 insertions, 259 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index a3f177a..a72d40b 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -238,11 +238,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -327,11 +338,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -416,11 +438,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -505,11 +538,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -594,11 +638,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -683,11 +738,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -772,11 +838,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -861,11 +938,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -950,11 +1038,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1039,11 +1138,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1128,11 +1238,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1217,11 +1338,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1306,11 +1438,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1395,11 +1538,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1602,9 +1756,14 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8+rip]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1651,8 +1810,14 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1784,7 +1949,6 @@ blake3_hash_many_sse2:
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
- movaps xmm14, xmmword ptr [ROT8+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
@@ -1830,7 +1994,10 @@ blake3_hash_many_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1854,7 +2021,10 @@ blake3_hash_many_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1928,7 +2098,6 @@ _blake3_compress_in_place_sse2:
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
mov al, 7
9:
paddd xmm0, xmm4
@@ -1945,7 +2114,10 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1969,7 +2141,10 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2042,7 +2217,6 @@ _blake3_compress_xof_sse2:
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
mov al, 7
9:
paddd xmm0, xmm4
@@ -2059,7 +2233,10 @@ _blake3_compress_xof_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2083,7 +2260,10 @@ _blake3_compress_xof_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2145,8 +2325,6 @@ _blake3_compress_xof_sse2:
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
-ROT8:
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
.long 0, 1, 2, 3
ADD1:
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index f45d1c6..04ee6f4 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -237,11 +237,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -326,11 +337,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -415,11 +437,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -504,11 +537,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -593,11 +637,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -682,11 +737,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -771,11 +837,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -860,11 +937,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -949,11 +1037,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1038,11 +1137,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1127,11 +1237,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1216,11 +1337,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1305,11 +1437,22 @@ blake3_hash_many_sse2:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+0x100]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1394,11 +1537,22 @@ blake3_hash_many_sse2:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8+rip]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+0x100]
@@ -1613,9 +1767,14 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8+rip]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1662,8 +1821,14 @@ blake3_hash_many_sse2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1795,7 +1960,6 @@ blake3_hash_many_sse2:
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
pinsrw xmm13, eax, 5
- movaps xmm14, xmmword ptr [ROT8+rip]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
@@ -1841,7 +2005,10 @@ blake3_hash_many_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1865,7 +2032,10 @@ blake3_hash_many_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1948,7 +2118,6 @@ _blake3_compress_in_place_sse2:
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
mov al, 7
9:
paddd xmm0, xmm4
@@ -1965,7 +2134,10 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1989,7 +2161,10 @@ _blake3_compress_in_place_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2079,7 +2254,6 @@ blake3_compress_xof_sse2:
pshufd xmm6, xmm6, 0x93
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 0x93
- movaps xmm14, xmmword ptr [ROT8+rip]
mov al, 7
9:
paddd xmm0, xmm4
@@ -2096,7 +2270,10 @@ blake3_compress_xof_sse2:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2120,7 +2297,10 @@ blake3_compress_xof_sse2:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2186,8 +2366,6 @@ blake3_compress_xof_sse2:
BLAKE3_IV:
.long 0x6A09E667, 0xBB67AE85
.long 0x3C6EF372, 0xA54FF53A
-ROT8:
- .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
ADD0:
.long 0, 1, 2, 3
ADD1:
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 4ab5b11..a6a1932 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -238,11 +238,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -327,11 +338,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -416,11 +438,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -505,11 +538,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -594,11 +638,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -683,11 +738,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -772,11 +838,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -861,11 +938,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -950,11 +1038,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1039,11 +1138,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1128,11 +1238,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1217,11 +1338,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1306,11 +1438,22 @@ innerloop4:
pxor xmm13, xmm1
pxor xmm14, xmm2
pxor xmm15, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
- pshufb xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
movdqa xmm8, xmmword ptr [rsp+100H]
paddd xmm8, xmm12
paddd xmm9, xmm13
@@ -1395,11 +1538,22 @@ innerloop4:
pxor xmm12, xmm1
pxor xmm13, xmm2
pxor xmm14, xmm3
- movdqa xmm8, xmmword ptr [ROT8]
- pshufb xmm15, xmm8
- pshufb xmm12, xmm8
- pshufb xmm13, xmm8
- pshufb xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
paddd xmm10, xmm15
paddd xmm11, xmm12
movdqa xmm8, xmmword ptr [rsp+100H]
@@ -1614,9 +1768,14 @@ roundloop2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- movaps xmm13, xmmword ptr [ROT8]
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1663,8 +1822,14 @@ roundloop2:
paddd xmm8, xmm9
pxor xmm3, xmm0
pxor xmm11, xmm8
- pshufb xmm3, xmm13
- pshufb xmm11, xmm13
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
paddd xmm2, xmm3
paddd xmm10, xmm11
pxor xmm1, xmm2
@@ -1796,7 +1961,6 @@ final1block:
sar eax, 16
pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
pinsrw xmm13, eax, 5
- movaps xmm14, xmmword ptr [ROT8]
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
@@ -1842,7 +2006,10 @@ roundloop1:
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1866,7 +2033,10 @@ roundloop1:
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1950,7 +2120,6 @@ _blake3_compress_in_place_sse2 PROC
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
- movaps xmm14, xmmword ptr [ROT8]
mov al, 7
@@:
paddd xmm0, xmm4
@@ -1967,7 +2136,10 @@ _blake3_compress_in_place_sse2 PROC
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -1991,7 +2163,10 @@ _blake3_compress_in_place_sse2 PROC
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2082,7 +2257,6 @@ _blake3_compress_xof_sse2 PROC
pshufd xmm6, xmm6, 93H
shufps xmm8, xmm7, 221
pshufd xmm7, xmm8, 93H
- movaps xmm14, xmmword ptr [ROT8]
mov al, 7
@@:
paddd xmm0, xmm4
@@ -2099,7 +2273,10 @@ _blake3_compress_xof_sse2 PROC
paddd xmm0, xmm5
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2123,7 +2300,10 @@ _blake3_compress_xof_sse2 PROC
paddd xmm0, xmm7
paddd xmm0, xmm1
pxor xmm3, xmm0
- pshufb xmm3, xmm14
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
paddd xmm2, xmm3
pxor xmm1, xmm2
movdqa xmm11, xmm1
@@ -2214,9 +2394,6 @@ BLAKE3_IV_3:
BLAKE3_BLOCK_LEN:
dd 4 dup (64)
-ROT8:
- db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
-
CMP_MSB_MASK:
dd 8 dup(80000000H)