diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-30 23:13:47 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-30 23:13:47 -0400 |
| commit | c592e9a3f604fa6c62ef547639723b5962529885 (patch) | |
| tree | 29a3f9bb629393d7f723b1eebb0afb78991ecd4e | |
| parent | c33a8462d1e1770f91a1aa4c4854ae000ed865ae (diff) | |
C: asm: remove blendvps usage altogether
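This simplifies the operation by removing the need to use blendvps at all. The pand/pandn/por mask-select sequence (the SSE2 substitute for blendvps) is replaced by two scalar dword loads whose address is indexed by the negated mask dword read from [rsp+0x130].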
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 23 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 23 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 23 |
3 files changed, 18 insertions, 51 deletions
```diff
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index a72d40b..2dcf879 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1915,23 +1915,12 @@ blake3_hash_many_sse2:
         movups  xmmword ptr [rbx+0x10], xmm1
         movups  xmmword ptr [rbx+0x20], xmm8
         movups  xmmword ptr [rbx+0x30], xmm9
-        movdqa  xmm0, xmmword ptr [rsp+0x130]
-        movdqa  xmm1, xmmword ptr [rsp+0x110]
-        movdqa  xmm2, xmmword ptr [rsp+0x120]
-        movdqu  xmm3, xmmword ptr [rsp+0x118]
-        movdqu  xmm4, xmmword ptr [rsp+0x128]
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm1
-        pand    xmm3, xmm6
-        por     xmm3, xmm5
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm2
-        pand    xmm4, xmm6
-        por     xmm4, xmm5
-        movdqa  xmmword ptr [rsp+0x110], xmm3
-        movdqa  xmmword ptr [rsp+0x120], xmm4
+        mov     eax, dword ptr [rsp+0x130]
+        neg     eax
+        mov     r10d, dword ptr [rsp+0x110+8*rax]
+        mov     r11d, dword ptr [rsp+0x120+8*rax]
+        mov     dword ptr [rsp+0x110], r10d
+        mov     dword ptr [rsp+0x120], r11d
         add     rdi, 16
         add     rbx, 64
         sub     rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 04ee6f4..a01c23c 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1926,23 +1926,12 @@ blake3_hash_many_sse2:
         movups  xmmword ptr [rbx+0x10], xmm1
         movups  xmmword ptr [rbx+0x20], xmm8
         movups  xmmword ptr [rbx+0x30], xmm9
-        movdqa  xmm0, xmmword ptr [rsp+0x130]
-        movdqa  xmm1, xmmword ptr [rsp+0x110]
-        movdqa  xmm2, xmmword ptr [rsp+0x120]
-        movdqu  xmm3, xmmword ptr [rsp+0x118]
-        movdqu  xmm4, xmmword ptr [rsp+0x128]
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm1
-        pand    xmm3, xmm6
-        por     xmm3, xmm5
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm2
-        pand    xmm4, xmm6
-        por     xmm4, xmm5
-        movdqa  xmmword ptr [rsp+0x110], xmm3
-        movdqa  xmmword ptr [rsp+0x120], xmm4
+        mov     eax, dword ptr [rsp+0x130]
+        neg     eax
+        mov     r10d, dword ptr [rsp+0x110+8*rax]
+        mov     r11d, dword ptr [rsp+0x120+8*rax]
+        mov     dword ptr [rsp+0x110], r10d
+        mov     dword ptr [rsp+0x120], r11d
         add     rdi, 16
         add     rbx, 64
         sub     rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index e649a25..da510d8 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1927,23 +1927,12 @@ endroundloop2:
         movups  xmmword ptr [rbx+10H], xmm1
         movups  xmmword ptr [rbx+20H], xmm8
         movups  xmmword ptr [rbx+30H], xmm9
-        movdqa  xmm0, xmmword ptr [rsp+130H]
-        movdqa  xmm1, xmmword ptr [rsp+110H]
-        movdqa  xmm2, xmmword ptr [rsp+120H]
-        movdqu  xmm3, xmmword ptr [rsp+118H]
-        movdqu  xmm4, xmmword ptr [rsp+128H]
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm1
-        pand    xmm3, xmm6
-        por     xmm3, xmm5
-        movdqa  xmm5, xmm0
-        movdqa  xmm6, xmm0
-        pandn   xmm5, xmm2
-        pand    xmm4, xmm6
-        por     xmm4, xmm5
-        movdqa  xmmword ptr [rsp+110H], xmm3
-        movdqa  xmmword ptr [rsp+120H], xmm4
+        mov     eax, dword ptr [rsp+130H]
+        neg     eax
+        mov     r10d, dword ptr [rsp+110H+8*rax]
+        mov     r11d, dword ptr [rsp+120H+8*rax]
+        mov     dword ptr [rsp+110H], r10d
+        mov     dword ptr [rsp+120H], r11d
         add     rdi, 16
         add     rbx, 64
         sub     rsi, 2
```
