about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthew Krupcale <[email protected]> 2020-08-30 23:13:47 -0400
committer: Matthew Krupcale <[email protected]> 2020-08-30 23:13:47 -0400
commit: c592e9a3f604fa6c62ef547639723b5962529885 (patch)
tree: 29a3f9bb629393d7f723b1eebb0afb78991ecd4e
parent: c33a8462d1e1770f91a1aa4c4854ae000ed865ae (diff)
C: asm: remove blendvps usage altogether
This simplifies the operation by removing the need to use blendvps at all.
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 23
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 23
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 23
3 files changed, 18 insertions, 51 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index a72d40b..2dcf879 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1915,23 +1915,12 @@ blake3_hash_many_sse2:
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
- movdqa xmm0, xmmword ptr [rsp+0x130]
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm2, xmmword ptr [rsp+0x120]
- movdqu xmm3, xmmword ptr [rsp+0x118]
- movdqu xmm4, xmmword ptr [rsp+0x128]
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm1
- pand xmm3, xmm6
- por xmm3, xmm5
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm2
- pand xmm4, xmm6
- por xmm4, xmm5
- movdqa xmmword ptr [rsp+0x110], xmm3
- movdqa xmmword ptr [rsp+0x120], xmm4
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
add rdi, 16
add rbx, 64
sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 04ee6f4..a01c23c 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1926,23 +1926,12 @@ blake3_hash_many_sse2:
movups xmmword ptr [rbx+0x10], xmm1
movups xmmword ptr [rbx+0x20], xmm8
movups xmmword ptr [rbx+0x30], xmm9
- movdqa xmm0, xmmword ptr [rsp+0x130]
- movdqa xmm1, xmmword ptr [rsp+0x110]
- movdqa xmm2, xmmword ptr [rsp+0x120]
- movdqu xmm3, xmmword ptr [rsp+0x118]
- movdqu xmm4, xmmword ptr [rsp+0x128]
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm1
- pand xmm3, xmm6
- por xmm3, xmm5
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm2
- pand xmm4, xmm6
- por xmm4, xmm5
- movdqa xmmword ptr [rsp+0x110], xmm3
- movdqa xmmword ptr [rsp+0x120], xmm4
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
add rdi, 16
add rbx, 64
sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index e649a25..da510d8 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1927,23 +1927,12 @@ endroundloop2:
movups xmmword ptr [rbx+10H], xmm1
movups xmmword ptr [rbx+20H], xmm8
movups xmmword ptr [rbx+30H], xmm9
- movdqa xmm0, xmmword ptr [rsp+130H]
- movdqa xmm1, xmmword ptr [rsp+110H]
- movdqa xmm2, xmmword ptr [rsp+120H]
- movdqu xmm3, xmmword ptr [rsp+118H]
- movdqu xmm4, xmmword ptr [rsp+128H]
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm1
- pand xmm3, xmm6
- por xmm3, xmm5
- movdqa xmm5, xmm0
- movdqa xmm6, xmm0
- pandn xmm5, xmm2
- pand xmm4, xmm6
- por xmm4, xmm5
- movdqa xmmword ptr [rsp+110H], xmm3
- movdqa xmmword ptr [rsp+120H], xmm4
+ mov eax, dword ptr [rsp+130H]
+ neg eax
+ mov r10d, dword ptr [rsp+110H+8*rax]
+ mov r11d, dword ptr [rsp+120H+8*rax]
+ mov dword ptr [rsp+110H], r10d
+ mov dword ptr [rsp+120H], r11d
add rdi, 16
add rbx, 64
sub rsi, 2