about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Matthew Krupcale <[email protected]> 2020-08-31 00:21:47 -0400
committer: Matthew Krupcale <[email protected]> 2020-08-31 00:21:47 -0400
commit: 47e415c7f19d97b3a39720f9c892288e82d4bd99 (patch)
tree: e841078145d9c63f9f00f6da9025c2face04e82f
parent: c592e9a3f604fa6c62ef547639723b5962529885 (diff)
C: asm: simplify pinsrd emulation
Use punpckl{,q}dq instead of pinsrw.
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 52
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 52
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 52
3 files changed, 51 insertions, 105 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 2dcf879..8b26125 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1656,24 +1656,12 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [rsp+0x124], 2
- pinsrw xmm14, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm14, eax, 5
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@@ -1714,14 +1702,14 @@ blake3_hash_many_sse2:
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
- pinsrw xmm11, eax, 6
- pinsrw xmm11, r14d, 7
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
mov al, 7
9:
paddd xmm0, xmm4
@@ -1930,14 +1918,8 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
@@ -1949,11 +1931,11 @@ blake3_hash_many_sse2:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm3, xmm13
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index a01c23c..b2ee40d 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1667,24 +1667,12 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [rsp+0x124], 2
- pinsrw xmm14, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm14, eax, 5
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@@ -1725,14 +1713,14 @@ blake3_hash_many_sse2:
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
- pinsrw xmm11, eax, 6
- pinsrw xmm11, r14d, 7
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
mov al, 7
9:
paddd xmm0, xmm4
@@ -1941,14 +1929,8 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
@@ -1960,11 +1942,11 @@ blake3_hash_many_sse2:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm3, xmm13
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index da510d8..70a3044 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1668,24 +1668,12 @@ final3blocks:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+110H]
- mov eax, dword ptr [rsp+120H]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+120H], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+120H]
+ punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+114H]
- mov eax, dword ptr [BLAKE3_BLOCK_LEN]
- sar eax, 16
- pinsrw xmm14, word ptr [rsp+124H], 2
- pinsrw xmm14, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN]
- sar eax, 16
- pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN], 4
- pinsrw xmm14, eax, 5
+ movd xmm13, dword ptr [rsp+124H]
+ punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+10H], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
@@ -1726,14 +1714,14 @@ innerloop2:
pshufd xmm14, xmm14, 93H
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 93H
+ shl rax, 20H
+ or rax, 40H
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+20H], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+10H]
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
- pinsrw xmm11, eax, 6
- pinsrw xmm11, r14d, 7
+ punpcklqdq xmm3, xmmword ptr [rsp+20H]
+ punpcklqdq xmm11, xmmword ptr [rsp+20H]
mov al, 7
roundloop2:
paddd xmm0, xmm4
@@ -1942,14 +1930,8 @@ final1block:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movd xmm13, dword ptr [rsp+110H]
- mov eax, dword ptr [rsp+120H]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+120H], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+120H]
+ punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+80H]
or eax, r13d
@@ -1961,11 +1943,11 @@ innerloop1:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
- movaps xmm3, xmm13
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm8, xmm4