about summary refs log tree commit diff
diff options
context:
space:
mode:
author Matthew Krupcale <[email protected]> 2020-08-31 11:36:01 -0400
committer Matthew Krupcale <[email protected]> 2020-08-31 12:12:42 -0400
commit be2da69b6b293764867c42fcbc278627271d9710 (patch)
tree 939daaec31af7466103b34e920a822dca6487b1a
parent 47e415c7f19d97b3a39720f9c892288e82d4bd99 (diff)
C: asm: simplify pblendw emulation
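Use a statically calculated ~mask: add the complementary constants PBLENDW_0x33_MASK and PBLENDW_0x3F_MASK so that each emulated pblendw is done with two pand instructions and a por, instead of deriving the inverted mask at run time with pandn. This reduces the number of moves and registers necessary at the expense of an extra memory load. This is probably a good trade-off since we are not bound by memory uops in this loop.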
-rw-r--r-- c/blake3_sse2_x86-64_unix.S 97
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 97
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 97
3 files changed, 108 insertions, 183 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 8b26125..245c519 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1833,24 +1833,17 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- movdqa xmmword ptr [rsp+0x20], xmm2
- movdqa xmmword ptr [rsp+0x40], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm13
- pand xmm3, xmm12
- movdqa xmm13, xmm3
- por xmm13, xmm2
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm12
- pand xmm3, xmm6
- movdqa xmm12, xmm3
+ movdqa xmmword ptr [rsp+0x20], xmm2
+ movdqa xmm2, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+0x20]
- movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1864,24 +1857,17 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- movdqa xmmword ptr [rsp+0x30], xmm2
- movdqa xmmword ptr [rsp+0x50], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm6
- pand xmm3, xmm5
- movdqa xmm6, xmm3
- por xmm6, xmm2
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm5
- pand xmm3, xmm14
- movdqa xmm5, xmm3
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+0x30]
- movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -2013,19 +1999,14 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2133,19 +2114,14 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2252,19 +2228,14 @@ _blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2312,7 +2283,11 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK:
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index b2ee40d..0800f4a 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1844,24 +1844,17 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm12, 0x39
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- movdqa xmmword ptr [rsp+0x20], xmm2
- movdqa xmmword ptr [rsp+0x40], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm13
- pand xmm3, xmm12
- movdqa xmm13, xmm3
- por xmm13, xmm2
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm12
- pand xmm3, xmm6
- movdqa xmm12, xmm3
+ movdqa xmmword ptr [rsp+0x20], xmm2
+ movdqa xmm2, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+0x20]
- movdqa xmm3, xmmword ptr [rsp+0x40]
pshufd xmm12, xmm12, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1875,24 +1868,17 @@ blake3_hash_many_sse2:
pshufd xmm12, xmm5, 0x39
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- movdqa xmmword ptr [rsp+0x30], xmm2
- movdqa xmmword ptr [rsp+0x50], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm6
- pand xmm3, xmm5
- movdqa xmm6, xmm3
- por xmm6, xmm2
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm3, xmm2
- pandn xmm2, xmm5
- pand xmm3, xmm14
- movdqa xmm5, xmm3
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+0x30]
- movdqa xmm3, xmmword ptr [rsp+0x50]
pshufd xmm5, xmm5, 0x78
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -2024,19 +2010,14 @@ blake3_hash_many_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2153,19 +2134,14 @@ _blake3_compress_in_place_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2289,19 +2265,14 @@ blake3_compress_xof_sse2:
pshufd xmm4, xmm8, 0x39
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
por xmm8, xmm10
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
@@ -2353,7 +2324,11 @@ BLAKE3_BLOCK_LEN:
.long 64, 64, 64, 64
CMP_MSB_MASK:
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK:
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 70a3044..0a2d9cb 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1845,24 +1845,17 @@ roundloop2:
pshufd xmm4, xmm12, 39H
movdqa xmm12, xmm6
shufps xmm12, xmm7, 250
- movdqa xmmword ptr [rsp+20H], xmm2
- movdqa xmmword ptr [rsp+40H], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
- movdqa xmm3, xmm2
- pandn xmm2, xmm13
- pand xmm3, xmm12
- movdqa xmm13, xmm3
- por xmm13, xmm2
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm13, xmm12
movdqa xmm12, xmm7
punpcklqdq xmm12, xmm5
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
- movdqa xmm3, xmm2
- pandn xmm2, xmm12
- pand xmm3, xmm6
- movdqa xmm12, xmm3
+ movdqa xmmword ptr [rsp+20H], xmm2
+ movdqa xmm2, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm12, xmm2
movdqa xmm2, xmmword ptr [rsp+20H]
- movdqa xmm3, xmmword ptr [rsp+40H]
pshufd xmm12, xmm12, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -1876,24 +1869,17 @@ roundloop2:
pshufd xmm12, xmm5, 39H
movdqa xmm5, xmm14
shufps xmm5, xmm15, 250
- movdqa xmmword ptr [rsp+30H], xmm2
- movdqa xmmword ptr [rsp+50H], xmm3
- movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK]
- movdqa xmm3, xmm2
- pandn xmm2, xmm6
- pand xmm3, xmm5
- movdqa xmm6, xmm3
- por xmm6, xmm2
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm6, xmm5
movdqa xmm5, xmm15
punpcklqdq xmm5, xmm13
- movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
- movdqa xmm3, xmm2
- pandn xmm2, xmm5
- pand xmm3, xmm14
- movdqa xmm5, xmm3
+ movdqa xmmword ptr [rsp+30H], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm5, xmm2
movdqa xmm2, xmmword ptr [rsp+30H]
- movdqa xmm3, xmmword ptr [rsp+50H]
pshufd xmm5, xmm5, 78H
punpckhdq xmm13, xmm15
punpckldq xmm14, xmm13
@@ -2025,19 +2011,14 @@ roundloop1:
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@@ -2155,19 +2136,14 @@ _blake3_compress_in_place_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@@ -2292,19 +2268,14 @@ _blake3_compress_xof_sse2 PROC
pshufd xmm4, xmm8, 39H
movdqa xmm8, xmm6
shufps xmm8, xmm7, 250
- movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm9
- pand xmm11, xmm8
- movdqa xmm9, xmm11
- por xmm9, xmm10
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
+ por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- movdqa xmm11, xmm10
- pandn xmm10, xmm8
- pand xmm11, xmm6
- movdqa xmm8, xmm11
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
por xmm8, xmm10
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
@@ -2368,8 +2339,12 @@ BLAKE3_BLOCK_LEN:
CMP_MSB_MASK:
dd 8 dup(80000000H)
+PBLENDW_0x33_MASK:
+ dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
PBLENDW_0xCC_MASK:
dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
+PBLENDW_0x3F_MASK:
+ dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
PBLENDW_0xC0_MASK:
dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH