about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthew Krupcale <[email protected]> 2020-08-16 14:10:48 -0400
committer: Matthew Krupcale <[email protected]> 2020-08-24 00:57:28 -0400
commit: e632967a8da6205b118bb5921eb89f617ae8a12c (patch)
tree: 208d5aba1fb8989dc21c75c701b2b5c27cd657c9
parent: 460c9d3031052d248f10f89bdb75bc5262774c37 (diff)
C: asm: emulate blendvps using SSE2 instructions
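Replace each blendvps with a pand/pandn/por sequence that blends according to (mask & b) | ((~mask) & a), where a is the original blendvps destination register, b is its source register, and mask is xmm0. The blended value is left in b's register, so the following stores now write xmm3/xmm4 instead of xmm1/xmm2. Note that blendvps selects on only the sign bit of each 32-bit mask lane, so this bitwise form matches it only when every mask lane is all ones or all zeros.

* c/blake3_sse2_x86-64_unix.S: emulate blendvps using SSE2 instructions for x86_64 unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.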
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 16
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 16
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 16
3 files changed, 36 insertions, 12 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 0ba7183..9382b8f 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1694,10 +1694,18 @@ blake3_hash_many_sse2:
movdqa xmm2, xmmword ptr [rsp+0x120]
movdqu xmm3, xmmword ptr [rsp+0x118]
movdqu xmm4, xmmword ptr [rsp+0x128]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+0x110], xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm1
+ pand xmm3, xmm6
+ por xmm3, xmm5
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm2
+ pand xmm4, xmm6
+ por xmm4, xmm5
+ movdqa xmmword ptr [rsp+0x110], xmm3
+ movdqa xmmword ptr [rsp+0x120], xmm4
add rdi, 16
add rbx, 64
sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 8092f60..638e683 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1705,10 +1705,18 @@ blake3_hash_many_sse2:
movdqa xmm2, xmmword ptr [rsp+0x120]
movdqu xmm3, xmmword ptr [rsp+0x118]
movdqu xmm4, xmmword ptr [rsp+0x128]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+0x110], xmm1
- movdqa xmmword ptr [rsp+0x120], xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm1
+ pand xmm3, xmm6
+ por xmm3, xmm5
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm2
+ pand xmm4, xmm6
+ por xmm4, xmm5
+ movdqa xmmword ptr [rsp+0x110], xmm3
+ movdqa xmmword ptr [rsp+0x120], xmm4
add rdi, 16
add rbx, 64
sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 93c6ec3..41507a0 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1706,10 +1706,18 @@ endroundloop2:
movdqa xmm2, xmmword ptr [rsp+120H]
movdqu xmm3, xmmword ptr [rsp+118H]
movdqu xmm4, xmmword ptr [rsp+128H]
- blendvps xmm1, xmm3, xmm0
- blendvps xmm2, xmm4, xmm0
- movdqa xmmword ptr [rsp+110H], xmm1
- movdqa xmmword ptr [rsp+120H], xmm2
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm1
+ pand xmm3, xmm6
+ por xmm3, xmm5
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm0
+ pandn xmm5, xmm2
+ pand xmm4, xmm6
+ por xmm4, xmm5
+ movdqa xmmword ptr [rsp+110H], xmm3
+ movdqa xmmword ptr [rsp+120H], xmm4
add rdi, 16
add rbx, 64
sub rsi, 2