aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matthew Krupcale <[email protected]> 2020-08-15 13:39:52 -0400
committer Matthew Krupcale <[email protected]> 2020-08-24 00:57:09 -0400
commit 460c9d3031052d248f10f89bdb75bc5262774c37 (patch)
tree b67916f823e2cf7df34d4b044a8feb4514e4aafc
parent a9a701c6229c696817feeb32e526072689c9a256 (diff)
C: asm: emulate pblendw using SSE2 instructions
Use a constant mask to blend according to (mask & b) | ((~mask) & a), where a is the destination operand of the original pblendw and b is its source operand.
* c/blake3_sse2_x86-64_unix.S: emulate pblendw using SSE2 instructions for x86_64 unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 82
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 82
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 82
3 files changed, 216 insertions, 30 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 0d48f08..0ba7183 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1619,10 +1619,24 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0xCC
+ movdqa xmmword ptr [rsp+0x20], xmm2
+ movdqa xmmword ptr [rsp+0x40], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm13
+ pand xmm3, xmm12
+ movdqa xmm13, xmm3
+ por xmm13, xmm2
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0xC0
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm12
+ pand xmm3, xmm6
+ movdqa xmm12, xmm3
+ por xmm12, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x20]
+ movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1636,10 +1650,24 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0xCC
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmmword ptr [rsp+0x50], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm6
+ pand xmm3, xmm5
+ movdqa xmm6, xmm3
+ por xmm6, xmm2
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0xC0
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm5
+ pand xmm3, xmm14
+ movdqa xmm5, xmm3
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -1765,10 +1793,20 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1869,10 +1907,20 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1972,10 +2020,20 @@ _blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2026,3 +2084,7 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0xCC_MASK: /* all-ones in dword lanes 1 and 3 (the words imm8 0xCC selects), zero in lanes 0 and 2 */
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0xC0_MASK: /* all-ones in dword lane 3 only (the words imm8 0xC0 selects), zero in lanes 0-2 */
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 9dcf65f..8092f60 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1630,10 +1630,24 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0xCC
+ movdqa xmmword ptr [rsp+0x20], xmm2
+ movdqa xmmword ptr [rsp+0x40], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm13
+ pand xmm3, xmm12
+ movdqa xmm13, xmm3
+ por xmm13, xmm2
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0xC0
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm12
+ pand xmm3, xmm6
+ movdqa xmm12, xmm3
+ por xmm12, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x20]
+ movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1647,10 +1661,24 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0xCC
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmmword ptr [rsp+0x50], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm6
+ pand xmm3, xmm5
+ movdqa xmm6, xmm3
+ por xmm6, xmm2
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0xC0
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm5
+ pand xmm3, xmm14
+ movdqa xmm5, xmm3
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -1776,10 +1804,20 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1889,10 +1927,20 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2009,10 +2057,20 @@ blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0xCC
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0xC0
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2067,3 +2125,7 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0xCC_MASK: /* all-ones in dword lanes 1 and 3 (the words imm8 0xCC selects), zero in lanes 0 and 2 */
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0xC0_MASK: /* all-ones in dword lane 3 only (the words imm8 0xC0 selects), zero in lanes 0-2 */
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 85ba72d..93c6ec3 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1631,10 +1631,24 @@ roundloop2:
pshufd xmm4, xmm12, 39H
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- pblendw xmm13, xmm12, 0CCH
+ movdqa xmmword ptr [rsp+20H], xmm2
+ movdqa xmmword ptr [rsp+40H], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm13
+ pand xmm3, xmm12
+ movdqa xmm13, xmm3
+ por xmm13, xmm2
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- pblendw xmm12, xmm6, 0C0H
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm12
+ pand xmm3, xmm6
+ movdqa xmm12, xmm3
+ por xmm12, xmm2
+ movdqa xmm2, xmmword ptr [rsp+20H]
+ movdqa xmm3, xmmword ptr [rsp+40H]
pshufd xmm12, xmm12, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1648,10 +1662,24 @@ roundloop2:
pshufd xmm12, xmm5, 39H
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- pblendw xmm6, xmm5, 0CCH
+ movdqa xmmword ptr [rsp+30H], xmm2
+ movdqa xmmword ptr [rsp+50H], xmm3
+ movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm6
+ pand xmm3, xmm5
+ movdqa xmm6, xmm3
+ por xmm6, xmm2
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- pblendw xmm5, xmm14, 0C0H
+ movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
+ movdqa xmm3, xmm2
+ pandn xmm2, xmm5
+ pand xmm3, xmm14
+ movdqa xmm5, xmm3
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+30H]
+ movdqa xmm3, xmmword ptr [rsp+50H]
pshufd xmm5, xmm5, 78H
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -1777,10 +1805,20 @@ roundloop1:
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1891,10 +1929,20 @@ _blake3_compress_in_place_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2012,10 +2060,20 @@ _blake3_compress_xof_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- pblendw xmm9, xmm8, 0CCH
+ movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm9
+ pand xmm11, xmm8
+ movdqa xmm9, xmm11
+ por xmm9, xmm10
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- pblendw xmm8, xmm6, 0C0H
+ movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
+ movdqa xmm11, xmm10
+ pandn xmm10, xmm8
+ pand xmm11, xmm6
+ movdqa xmm8, xmm11
+ por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2087,3 +2145,7 @@ CMP_MSB_MASK:
+PBLENDW_0xCC_MASK: ; all-ones in dword lanes 1 and 3 (the words imm8 0CCH selects); must sit before _RDATA ENDS / END to be assembled
+ dd 00000000H, 0FFFFFFFFH, 00000000H, 0FFFFFFFFH
+PBLENDW_0xC0_MASK: ; all-ones in dword lane 3 only (the words imm8 0C0H selects); hex constants need a leading digit in MASM
+ dd 00000000H, 00000000H, 00000000H, 0FFFFFFFFH
_RDATA ENDS
END