diff options
| field | value | date |
|---|---|---|
| author | Matthew Krupcale <[email protected]> | 2020-08-15 13:39:52 -0400 |
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:57:09 -0400 |
| commit | 460c9d3031052d248f10f89bdb75bc5262774c37 (patch) | |
| tree | b67916f823e2cf7df34d4b044a8feb4514e4aafc | |
| parent | a9a701c6229c696817feeb32e526072689c9a256 (diff) | |
C: asm: emulate pblendw using SSE2 instructions
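pblendw is an SSE4.1 instruction and is not available on SSE2-only targets, so emulate it with a constant mask: result = (mask & b) | ((~mask) & a), where a is the destination operand and b is the source operand of the original pblendw (implemented with pandn, pand and por).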
* c/blake3_sse2_x86-64_unix.S: Emulate pblendw using SSE2 instructions for x86_64 Unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 82 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 82 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 82 |
3 files changed, 216 insertions, 30 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S index 0d48f08..0ba7183 100644 --- a/c/blake3_sse2_x86-64_unix.S +++ b/c/blake3_sse2_x86-64_unix.S @@ -1619,10 +1619,24 @@ blake3_hash_many_sse2: pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC + movdqa xmmword ptr [rsp+0x20], xmm2 + movdqa xmmword ptr [rsp+0x40], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm13 + pand xmm3, xmm12 + movdqa xmm13, xmm3 + por xmm13, xmm2 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm12 + pand xmm3, xmm6 + movdqa xmm12, xmm3 + por xmm12, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x20] + movdqa xmm3, xmmword ptr [rsp+0x40] pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1636,10 +1650,24 @@ blake3_hash_many_sse2: pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmmword ptr [rsp+0x50], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm6 + pand xmm3, xmm5 + movdqa xmm6, xmm3 + por xmm6, xmm2 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm5 + pand xmm3, xmm14 + movdqa xmm5, xmm3 + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + movdqa xmm3, xmmword ptr [rsp+0x50] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -1765,10 +1793,20 @@ blake3_hash_many_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, 
xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1869,10 +1907,20 @@ _blake3_compress_in_place_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1972,10 +2020,20 @@ _blake3_compress_xof_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2026,3 +2084,7 @@ BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S index 9dcf65f..8092f60 100644 --- a/c/blake3_sse2_x86-64_windows_gnu.S +++ b/c/blake3_sse2_x86-64_windows_gnu.S @@ -1630,10 +1630,24 @@ blake3_hash_many_sse2: pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0xCC + movdqa 
xmmword ptr [rsp+0x20], xmm2 + movdqa xmmword ptr [rsp+0x40], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm13 + pand xmm3, xmm12 + movdqa xmm13, xmm3 + por xmm13, xmm2 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0xC0 + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm12 + pand xmm3, xmm6 + movdqa xmm12, xmm3 + por xmm12, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x20] + movdqa xmm3, xmmword ptr [rsp+0x40] pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1647,10 +1661,24 @@ blake3_hash_many_sse2: pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0xCC + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmmword ptr [rsp+0x50], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm6 + pand xmm3, xmm5 + movdqa xmm6, xmm3 + por xmm6, xmm2 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0xC0 + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm3, xmm2 + pandn xmm2, xmm5 + pand xmm3, xmm14 + movdqa xmm5, xmm3 + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + movdqa xmm3, xmmword ptr [rsp+0x50] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -1776,10 +1804,20 @@ blake3_hash_many_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1889,10 +1927,20 @@ _blake3_compress_in_place_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 
- pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2009,10 +2057,20 @@ blake3_compress_xof_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0xCC + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0xC0 + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2067,3 +2125,7 @@ BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm index 85ba72d..93c6ec3 100644 --- a/c/blake3_sse2_x86-64_windows_msvc.asm +++ b/c/blake3_sse2_x86-64_windows_msvc.asm @@ -1631,10 +1631,24 @@ roundloop2: pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - pblendw xmm13, xmm12, 0CCH + movdqa xmmword ptr [rsp+20H], xmm2 + movdqa xmmword ptr [rsp+40H], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK] + movdqa xmm3, xmm2 + pandn xmm2, xmm13 + pand xmm3, xmm12 + movdqa xmm13, xmm3 + por xmm13, xmm2 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - pblendw xmm12, xmm6, 0C0H + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + 
movdqa xmm3, xmm2 + pandn xmm2, xmm12 + pand xmm3, xmm6 + movdqa xmm12, xmm3 + por xmm12, xmm2 + movdqa xmm2, xmmword ptr [rsp+20H] + movdqa xmm3, xmmword ptr [rsp+40H] pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1648,10 +1662,24 @@ roundloop2: pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - pblendw xmm6, xmm5, 0CCH + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmmword ptr [rsp+50H], xmm3 + movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK] + movdqa xmm3, xmm2 + pandn xmm2, xmm6 + pand xmm3, xmm5 + movdqa xmm6, xmm3 + por xmm6, xmm2 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - pblendw xmm5, xmm14, 0C0H + movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + movdqa xmm3, xmm2 + pandn xmm2, xmm5 + pand xmm3, xmm14 + movdqa xmm5, xmm3 + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + movdqa xmm3, xmmword ptr [rsp+50H] pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -1777,10 +1805,20 @@ roundloop1: pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1891,10 +1929,20 @@ _blake3_compress_in_place_sse2 PROC pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, 
xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2012,10 +2060,20 @@ _blake3_compress_xof_sse2 PROC pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - pblendw xmm9, xmm8, 0CCH + movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm9 + pand xmm11, xmm8 + movdqa xmm9, xmm11 + por xmm9, xmm10 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - pblendw xmm8, xmm6, 0C0H + movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + movdqa xmm11, xmm10 + pandn xmm10, xmm8 + pand xmm11, xmm6 + movdqa xmm8, xmm11 + por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -2087,3 +2145,7 @@ CMP_MSB_MASK: _RDATA ENDS END +PBLENDW_0xCC_MASK: + dd 00000000H, FFFFFFFFH, 00000000H, FFFFFFFFH +PBLENDW_0xC0_MASK: + dd 00000000H, 00000000H, 00000000H, FFFFFFFFH |
