aboutsummaryrefslogtreecommitdiff
path: root/c/blake3_sse2_x86-64_windows_gnu.S
diff options
context:
space:
mode:
author Matthew Krupcale <[email protected]> 2020-08-31 00:21:47 -0400
committer Matthew Krupcale <[email protected]> 2020-08-31 00:21:47 -0400
commit 47e415c7f19d97b3a39720f9c892288e82d4bd99 (patch)
tree e841078145d9c63f9f00f6da9025c2face04e82f /c/blake3_sse2_x86-64_windows_gnu.S
parent c592e9a3f604fa6c62ef547639723b5962529885 (diff)
C: asm: simplify pinsrd emulation
Use punpckl{,q}dq instead of pinsrw.
Diffstat (limited to 'c/blake3_sse2_x86-64_windows_gnu.S')
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 52
1 file changed, 17 insertions, 35 deletions
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index a01c23c..b2ee40d 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1667,24 +1667,12 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [rsp+0x124], 2
- pinsrw xmm14, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm14, eax, 5
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@@ -1725,14 +1713,14 @@ blake3_hash_many_sse2:
pshufd xmm14, xmm14, 0x93
shufps xmm11, xmm15, 221
pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movd xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
- pinsrw xmm11, eax, 6
- pinsrw xmm11, r14d, 7
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
mov al, 7
9:
paddd xmm0, xmm4
@@ -1941,14 +1929,8 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
- mov eax, dword ptr [rsp+0x120]
- sar eax, 16
- pinsrw xmm13, word ptr [rsp+0x120], 2
- pinsrw xmm13, eax, 3
- mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
- sar eax, 16
- pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
- pinsrw xmm13, eax, 5
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
mov r8, qword ptr [rdi]
movzx eax, byte ptr [rbp+0x80]
or eax, r13d
@@ -1960,11 +1942,11 @@ blake3_hash_many_sse2:
cmp rdx, r15
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
- movaps xmm3, xmm13
- mov r14d, eax
- sar r14d, 16
- pinsrw xmm3, eax, 6
- pinsrw xmm3, r14d, 7
+ shl rax, 32
+ or rax, 64
+ movd xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4