diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-31 00:21:47 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-31 00:21:47 -0400 |
| commit | 47e415c7f19d97b3a39720f9c892288e82d4bd99 (patch) | |
| tree | e841078145d9c63f9f00f6da9025c2face04e82f /c/blake3_sse2_x86-64_windows_gnu.S | |
| parent | c592e9a3f604fa6c62ef547639723b5962529885 (diff) | |
C: asm: simplify pinsrd emulation
Use punpckl{,q}dq instead of pinsrw.
Diffstat (limited to 'c/blake3_sse2_x86-64_windows_gnu.S')
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 52 |
1 file changed, 17 insertions, 35 deletions
```diff
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index a01c23c..b2ee40d 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1667,24 +1667,12 @@ blake3_hash_many_sse2:
         movaps xmm8, xmm0
         movaps xmm9, xmm1
         movd xmm13, dword ptr [rsp+0x110]
-        mov eax, dword ptr [rsp+0x120]
-        sar eax, 16
-        pinsrw xmm13, word ptr [rsp+0x120], 2
-        pinsrw xmm13, eax, 3
-        mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
-        sar eax, 16
-        pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
-        pinsrw xmm13, eax, 5
+        movd xmm14, dword ptr [rsp+0x120]
+        punpckldq xmm13, xmm14
         movaps xmmword ptr [rsp], xmm13
         movd xmm14, dword ptr [rsp+0x114]
-        mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
-        sar eax, 16
-        pinsrw xmm14, word ptr [rsp+0x124], 2
-        pinsrw xmm14, eax, 3
-        mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
-        sar eax, 16
-        pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
-        pinsrw xmm14, eax, 5
+        movd xmm13, dword ptr [rsp+0x124]
+        punpckldq xmm14, xmm13
         movaps xmmword ptr [rsp+0x10], xmm14
         mov r8, qword ptr [rdi]
         mov r9, qword ptr [rdi+0x8]
@@ -1725,14 +1713,14 @@ blake3_hash_many_sse2:
         pshufd xmm14, xmm14, 0x93
         shufps xmm11, xmm15, 221
         pshufd xmm15, xmm11, 0x93
+        shl rax, 0x20
+        or rax, 0x40
+        movd xmm3, rax
+        movdqa xmmword ptr [rsp+0x20], xmm3
         movaps xmm3, xmmword ptr [rsp]
         movaps xmm11, xmmword ptr [rsp+0x10]
-        mov r14d, eax
-        sar r14d, 16
-        pinsrw xmm3, eax, 6
-        pinsrw xmm3, r14d, 7
-        pinsrw xmm11, eax, 6
-        pinsrw xmm11, r14d, 7
+        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
         mov al, 7
 9:
         paddd xmm0, xmm4
@@ -1941,14 +1929,8 @@ blake3_hash_many_sse2:
         movups xmm0, xmmword ptr [rcx]
         movups xmm1, xmmword ptr [rcx+0x10]
         movd xmm13, dword ptr [rsp+0x110]
-        mov eax, dword ptr [rsp+0x120]
-        sar eax, 16
-        pinsrw xmm13, word ptr [rsp+0x120], 2
-        pinsrw xmm13, eax, 3
-        mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
-        sar eax, 16
-        pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
-        pinsrw xmm13, eax, 5
+        movd xmm14, dword ptr [rsp+0x120]
+        punpckldq xmm13, xmm14
         mov r8, qword ptr [rdi]
         movzx eax, byte ptr [rbp+0x80]
         or eax, r13d
@@ -1960,11 +1942,11 @@ blake3_hash_many_sse2:
         cmp rdx, r15
         cmovne eax, r14d
         movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
-        movaps xmm3, xmm13
-        mov r14d, eax
-        sar r14d, 16
-        pinsrw xmm3, eax, 6
-        pinsrw xmm3, r14d, 7
+        shl rax, 32
+        or rax, 64
+        movd xmm12, rax
+        movdqa xmm3, xmm13
+        punpcklqdq xmm3, xmm12
         movups xmm4, xmmword ptr [r8+rdx-0x40]
         movups xmm5, xmmword ptr [r8+rdx-0x30]
         movaps xmm8, xmm4
```
