diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-16 14:10:48 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:57:28 -0400 |
| commit | e632967a8da6205b118bb5921eb89f617ae8a12c (patch) | |
| tree | 208d5aba1fb8989dc21c75c701b2b5c27cd657c9 | |
| parent | 460c9d3031052d248f10f89bdb75bc5262774c37 (diff) | |
C: asm: emulate blendvps using SSE2 instructions
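Blend according to (mask & b) | ((~mask) & a), where a is the destination operand of the original blendvps, b is its source operand, and mask is the selector held in xmm0. Because blendvps tests only the sign bit of each 32-bit mask lane while this form uses every mask bit, the two are equivalent only when each mask lane is all ones or all zeros.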
* c/blake3_sse2_x86-64_unix.S: Emulate blendvps using SSE2 instructions for x86_64 Unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 16 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 16 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 16 |
3 files changed, 36 insertions, 12 deletions
```diff
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 0ba7183..9382b8f 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1694,10 +1694,18 @@ blake3_hash_many_sse2:
         movdqa xmm2, xmmword ptr [rsp+0x120]
         movdqu xmm3, xmmword ptr [rsp+0x118]
         movdqu xmm4, xmmword ptr [rsp+0x128]
-        blendvps xmm1, xmm3, xmm0
-        blendvps xmm2, xmm4, xmm0
-        movdqa xmmword ptr [rsp+0x110], xmm1
-        movdqa xmmword ptr [rsp+0x120], xmm2
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm1
+        pand xmm3, xmm6
+        por xmm3, xmm5
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm2
+        pand xmm4, xmm6
+        por xmm4, xmm5
+        movdqa xmmword ptr [rsp+0x110], xmm3
+        movdqa xmmword ptr [rsp+0x120], xmm4
         add rdi, 16
         add rbx, 64
         sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 8092f60..638e683 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1705,10 +1705,18 @@ blake3_hash_many_sse2:
         movdqa xmm2, xmmword ptr [rsp+0x120]
         movdqu xmm3, xmmword ptr [rsp+0x118]
         movdqu xmm4, xmmword ptr [rsp+0x128]
-        blendvps xmm1, xmm3, xmm0
-        blendvps xmm2, xmm4, xmm0
-        movdqa xmmword ptr [rsp+0x110], xmm1
-        movdqa xmmword ptr [rsp+0x120], xmm2
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm1
+        pand xmm3, xmm6
+        por xmm3, xmm5
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm2
+        pand xmm4, xmm6
+        por xmm4, xmm5
+        movdqa xmmword ptr [rsp+0x110], xmm3
+        movdqa xmmword ptr [rsp+0x120], xmm4
         add rdi, 16
         add rbx, 64
         sub rsi, 2
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 93c6ec3..41507a0 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1706,10 +1706,18 @@ endroundloop2:
         movdqa xmm2, xmmword ptr [rsp+120H]
         movdqu xmm3, xmmword ptr [rsp+118H]
         movdqu xmm4, xmmword ptr [rsp+128H]
-        blendvps xmm1, xmm3, xmm0
-        blendvps xmm2, xmm4, xmm0
-        movdqa xmmword ptr [rsp+110H], xmm1
-        movdqa xmmword ptr [rsp+120H], xmm2
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm1
+        pand xmm3, xmm6
+        por xmm3, xmm5
+        movdqa xmm5, xmm0
+        movdqa xmm6, xmm0
+        pandn xmm5, xmm2
+        pand xmm4, xmm6
+        por xmm4, xmm5
+        movdqa xmmword ptr [rsp+110H], xmm3
+        movdqa xmmword ptr [rsp+120H], xmm4
         add rdi, 16
         add rbx, 64
         sub rsi, 2
```
