aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJack O'Connor <[email protected]>2022-03-12 01:14:10 -0500
committerJack O'Connor <[email protected]>2022-03-15 14:03:02 -0400
commitee558b2f3218cf77b58afb65a9e493fa99642080 (patch)
tree60a910ca7bba8095536e45afd14ef3683007c14d
parent2e5eb837e53722dc121cca50ff4fc27d3413dcea (diff)
generate blake3_{avx512,sse41,sse2}_compress with asm.py
-rwxr-xr-xasm/asm.py381
-rw-r--r--asm/out.S1359
-rw-r--r--benches/bench.rs34
-rw-r--r--src/kernel.rs85
4 files changed, 1859 insertions, 0 deletions
diff --git a/asm/asm.py b/asm/asm.py
new file mode 100755
index 0000000..f3b4b82
--- /dev/null
+++ b/asm/asm.py
@@ -0,0 +1,381 @@
+#! /usr/bin/env python3
+
+# Generate asm!
+
+from dataclasses import dataclass, replace
+
+X86_64 = "x86_64"
+AVX512 = "avx512"
+AVX2 = "avx2"
+SSE41 = "sse41"
+SSE2 = "sse2"
+LINUX = "linux"
+
+
+@dataclass
+class Target:
+ # Describes the code-generation target. Only x86_64 with the System V
+ # (Linux) calling convention is implemented so far; `os` is stored but
+ # not yet consulted by the register-selection helpers below.
+ # x86_64
+ arch: str # "x86_64"
+
+ # sse2, sse41, avx2, avx512, neon
+ extension: str # "sse41"
+
+ # unix, windows_msvc, windows_gnu
+ os: str
+
+ def arg32(self, index):
+ # 32-bit views of the System V integer argument registers, in order.
+ system_v_args_32 = ["edi", "esi", "edx", "ecx", "r8d", "r9d"]
+ return system_v_args_32[index]
+
+ def arg64(self, index):
+ # 64-bit System V integer argument registers, in order.
+ system_v_args_64 = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
+ return system_v_args_64[index]
+
+ def scratch32(self, index):
+ # 32-bit caller-saved scratch registers not used for arguments.
+ system_v_scratch_32 = ["eax", "r10d", "r11d"]
+ return system_v_scratch_32[index]
+
+ def scratch64(self, index):
+ # 64-bit counterparts of scratch32.
+ system_v_scratch_64 = ["rax", "r10", "r11"]
+ return system_v_scratch_64[index]
+
+ def reg128(self, index):
+ # Name of the index'th 128-bit vector register ("xmm0".."xmm15").
+ assert self.arch == X86_64
+ return "xmm" + str(index)
+
+ def ret(self):
+ # Return-instruction mnemonic for this target.
+ return "ret"
+
+
+def add_row(t, o, dest, src):
+ # Append a lane-wise 32-bit add to `o`: xmm[dest] += xmm[src].
+ assert t.arch == X86_64
+ if t.extension == AVX512:
+ # Three-operand EVEX form.
+ o.append(f"vpaddd xmm{dest}, xmm{dest}, xmm{src}")
+ elif t.extension in (SSE41, SSE2):
+ # Two-operand legacy-SSE form.
+ o.append(f"paddd xmm{dest}, xmm{src}")
+ else:
+ raise NotImplementedError
+
+
+def xor_row(t, o, dest, src):
+ # Append a bitwise xor to `o`: xmm[dest] ^= xmm[src].
+ assert t.arch == X86_64
+ if t.extension == AVX512:
+ o.append(f"vpxord xmm{dest}, xmm{dest}, xmm{src}")
+ elif t.extension in (SSE41, SSE2):
+ o.append(f"pxor xmm{dest}, xmm{src}")
+ else:
+ raise NotImplementedError
+
+
+# This is the >>> operation in G, not state diagonalization or message permutation.
+def bitrotate_row(t, o, reg, bits):
+ # Append a lane-wise 32-bit right-rotate of xmm[reg] by `bits`.
+ # The call sites below only use 16, 12, 8, and 7 (BLAKE3's G rotations).
+ assert t.arch == X86_64
+ if t.extension == AVX512:
+ # AVX-512 has a native per-lane rotate instruction.
+ o.append(f"vprord xmm{reg}, xmm{reg}, {bits}")
+ elif t.extension == SSE41:
+ if bits == 16:
+ # Byte-granular rotates are done as a pshufb byte shuffle.
+ # xmm15 is initialized at the top of kernel_1.
+ o.append(f"pshufb xmm{reg}, xmm15")
+ elif bits == 8:
+ # xmm14 is initialized at the top of kernel_1.
+ o.append(f"pshufb xmm{reg}, xmm14")
+ else:
+ # Do two bitshifts, using register 11 as temp.
+ o.append(f"movdqa xmm11, xmm{reg}")
+ o.append(f"pslld xmm{reg}, {32 - bits}")
+ o.append(f"psrld xmm11, {bits}")
+ o.append(f"por xmm{reg}, xmm11")
+ elif t.extension == SSE2:
+ if bits == 16:
+ # Swapping the two 16-bit words of each 32-bit lane (imm 0xB1
+ # on the low and high halves) is exactly a 16-bit rotate.
+ o.append(f"pshuflw xmm{reg}, xmm{reg}, 0xB1")
+ o.append(f"pshufhw xmm{reg}, xmm{reg}, 0xB1")
+ else:
+ # Do two bitshifts, using register 11 as temp.
+ o.append(f"movdqa xmm11, xmm{reg}")
+ o.append(f"pslld xmm{reg}, {32 - bits}")
+ o.append(f"psrld xmm11, {bits}")
+ o.append(f"por xmm{reg}, xmm11")
+ else:
+ raise NotImplementedError
+
+
+def diagonalize_state_rows(t, o):
+ # Rotate state rows 0, 3, and 2 within their lanes so the second half of
+ # each round can operate on diagonals as if they were columns. Row 1 is
+ # the pivot and is left in place.
+ if t.extension == AVX512:
+ o.append("vpshufd xmm0, xmm0, 0x93")
+ o.append("vpshufd xmm3, xmm3, 0x4E")
+ o.append("vpshufd xmm2, xmm2, 0x39")
+ elif t.extension in (SSE41, SSE2):
+ o.append("pshufd xmm0, xmm0, 0x93")
+ o.append("pshufd xmm3, xmm3, 0x4E")
+ o.append("pshufd xmm2, xmm2, 0x39")
+ else:
+ raise NotImplementedError
+
+
+def undiagonalize_state_rows(t, o):
+ # Inverse of diagonalize_state_rows: the 0x93/0x39 shuffle immediates are
+ # swapped (0x4E is its own inverse), restoring column order.
+ if t.extension == AVX512:
+ o.append("vpshufd xmm0, xmm0, 0x39")
+ o.append("vpshufd xmm3, xmm3, 0x4E")
+ o.append("vpshufd xmm2, xmm2, 0x93")
+ elif t.extension in (SSE41, SSE2):
+ o.append("pshufd xmm0, xmm0, 0x39")
+ o.append("pshufd xmm3, xmm3, 0x4E")
+ o.append("pshufd xmm2, xmm2, 0x93")
+ else:
+ raise NotImplementedError
+
+
+def permute_message_rows(t, o):
+ # Apply one step of the message permutation to the message rows
+ # xmm4-xmm7, using xmm8-xmm10 as temporaries. The three branches are the
+ # same data movement expressed with each extension's available blends:
+ # AVX-512 uses vpblendd, SSE4.1 uses pblendw, and SSE2 emulates the
+ # blends with pand/por against the PBLENDW_*_MASK constants emitted by
+ # emit_sse2().
+ if t.extension == AVX512:
+ o.append("vshufps xmm8, xmm4, xmm5, 214")
+ o.append("vpshufd xmm9, xmm4, 0x0F")
+ o.append("vpshufd xmm4, xmm8, 0x39")
+ o.append("vshufps xmm8, xmm6, xmm7, 250")
+ o.append("vpblendd xmm9, xmm9, xmm8, 0xAA")
+ o.append("vpunpcklqdq xmm8, xmm7, xmm5")
+ o.append("vpblendd xmm8, xmm8, xmm6, 0x88")
+ o.append("vpshufd xmm8, xmm8, 0x78")
+ o.append("vpunpckhdq xmm5, xmm5, xmm7")
+ o.append("vpunpckldq xmm6, xmm6, xmm5")
+ o.append("vpshufd xmm7, xmm6, 0x1E")
+ # Move the results staged in xmm9/xmm8 into their final rows.
+ o.append("vmovdqa xmm5, xmm9")
+ o.append("vmovdqa xmm6, xmm8")
+ elif t.extension == SSE41:
+ o.append("movdqa xmm8, xmm4")
+ o.append("shufps xmm8, xmm5, 214")
+ o.append("pshufd xmm9, xmm4, 0x0F")
+ o.append("pshufd xmm4, xmm8, 0x39")
+ o.append("movdqa xmm8, xmm6")
+ o.append("shufps xmm8, xmm7, 250")
+ o.append("pblendw xmm9, xmm8, 0xCC")
+ o.append("movdqa xmm8, xmm7")
+ o.append("punpcklqdq xmm8, xmm5")
+ o.append("pblendw xmm8, xmm6, 0xC0")
+ o.append("pshufd xmm8, xmm8, 0x78")
+ o.append("punpckhdq xmm5, xmm7")
+ o.append("punpckldq xmm6, xmm5")
+ o.append("pshufd xmm7, xmm6, 0x1E")
+ o.append("movdqa xmm5, xmm9")
+ o.append("movdqa xmm6, xmm8")
+ elif t.extension == SSE2:
+ o.append("movdqa xmm8, xmm4")
+ o.append("shufps xmm8, xmm5, 214")
+ o.append("pshufd xmm9, xmm4, 0x0F")
+ o.append("pshufd xmm4, xmm8, 0x39")
+ o.append("movdqa xmm8, xmm6")
+ o.append("shufps xmm8, xmm7, 250")
+ # pand/pand/por emulates SSE4.1's pblendw using the mask constants.
+ o.append("pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]")
+ o.append("pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]")
+ o.append("por xmm9, xmm8")
+ o.append("movdqa xmm8, xmm7")
+ o.append("punpcklqdq xmm8, xmm5")
+ o.append("movdqa xmm10, xmm6")
+ o.append("pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]")
+ o.append("pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]")
+ o.append("por xmm8, xmm10")
+ o.append("pshufd xmm8, xmm8, 0x78")
+ o.append("punpckhdq xmm5, xmm7")
+ o.append("punpckldq xmm6, xmm5")
+ o.append("pshufd xmm7, xmm6, 0x1E")
+ o.append("movdqa xmm5, xmm9")
+ o.append("movdqa xmm6, xmm8")
+ else:
+ raise NotImplementedError
+
+
+def kernel_1(t, o):
+ # Emit the 7-round single-block permutation core as a callable local
+ # label, blake3_<ext>_kernel_1. On entry xmm0-xmm3 hold the four state
+ # rows and xmm4-xmm7 hold the four message rows. Each round is: G on
+ # the columns (two add/add/xor/rotate half-steps with rotations
+ # 16,12 then 8,7), diagonalize, G on the diagonals, un-diagonalize.
+ o.append(f"blake3_{t.extension}_kernel_1:")
+ if t.extension == SSE41:
+ # Load the pshufb byte-rotation tables used by bitrotate_row.
+ o.append(f"movaps xmm14, xmmword ptr [ROT8+rip]")
+ o.append(f"movaps xmm15, xmmword ptr [ROT16+rip]")
+ for round_number in range(7):
+ if round_number > 0:
+ # Un-diagonalize and permute before each round except the first.
+ # compress_finish() will also partially un-diagonalize.
+ permute_message_rows(t, o)
+ # First half-round (columns), rotations 16 and 12.
+ add_row(t, o, dest=0, src=4)
+ add_row(t, o, dest=0, src=1)
+ xor_row(t, o, dest=3, src=0)
+ bitrotate_row(t, o, reg=3, bits=16)
+ add_row(t, o, dest=2, src=3)
+ xor_row(t, o, dest=1, src=2)
+ bitrotate_row(t, o, reg=1, bits=12)
+ # Second half-round (columns), rotations 8 and 7.
+ add_row(t, o, dest=0, src=5)
+ add_row(t, o, dest=0, src=1)
+ xor_row(t, o, dest=3, src=0)
+ bitrotate_row(t, o, reg=3, bits=8)
+ add_row(t, o, dest=2, src=3)
+ xor_row(t, o, dest=1, src=2)
+ bitrotate_row(t, o, reg=1, bits=7)
+ diagonalize_state_rows(t, o)
+ # Same two half-rounds again, now operating on the diagonals and
+ # consuming message rows 6 and 7.
+ add_row(t, o, dest=0, src=6)
+ add_row(t, o, dest=0, src=1)
+ xor_row(t, o, dest=3, src=0)
+ bitrotate_row(t, o, reg=3, bits=16)
+ add_row(t, o, dest=2, src=3)
+ xor_row(t, o, dest=1, src=2)
+ bitrotate_row(t, o, reg=1, bits=12)
+ add_row(t, o, dest=0, src=7)
+ add_row(t, o, dest=0, src=1)
+ xor_row(t, o, dest=3, src=0)
+ bitrotate_row(t, o, reg=3, bits=8)
+ add_row(t, o, dest=2, src=3)
+ xor_row(t, o, dest=1, src=2)
+ bitrotate_row(t, o, reg=1, bits=7)
+ undiagonalize_state_rows(t, o)
+ # Xor the last two rows into the first two, but don't do the feed forward
+ # here. That's only done in the XOF case.
+ xor_row(t, o, dest=0, src=2)
+ xor_row(t, o, dest=1, src=3)
+ o.append(t.ret())
+
+
+def compress_setup(t, o):
+ # Emit the compress() prologue. Judging by how the arguments are used:
+ # arg0 = CV pointer, arg1 = 64-byte block pointer, arg2 = 64-bit counter,
+ # arg3 = block_len (32-bit), arg4 = flags (32-bit) — TODO confirm against
+ # the Rust-side declaration. Loads rows 0-1 from the CV, row 2 from
+ # BLAKE3_IV, packs row 3 as [counter, block_len | flags<<32], and
+ # deinterleaves the message words into rows xmm4-xmm7 (rows 6-7 get a
+ # 0x93 pre-rotation).
+ if t.extension == AVX512:
+ o.append(f"vmovdqu xmm0, xmmword ptr [{t.arg64(0)}]")
+ o.append(f"vmovdqu xmm1, xmmword ptr [{t.arg64(0)}+0x10]")
+ o.append(f"shl {t.arg64(4)}, 32")
+ # A 32-bit self-move zero-extends arg3's upper 32 bits.
+ o.append(f"mov {t.arg32(3)}, {t.arg32(3)}")
+ o.append(f"or {t.arg64(3)}, {t.arg64(4)}")
+ o.append(f"vmovq xmm3, {t.arg64(2)}")
+ o.append(f"vmovq xmm4, {t.arg64(3)}")
+ o.append(f"vpunpcklqdq xmm3, xmm3, xmm4")
+ o.append(f"vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]")
+ o.append(f"vmovups xmm8, xmmword ptr [{t.arg64(1)}]")
+ o.append(f"vmovups xmm9, xmmword ptr [{t.arg64(1)}+0x10]")
+ # shufps 136/221 split even/odd 32-bit message words.
+ o.append(f"vshufps xmm4, xmm8, xmm9, 136")
+ o.append(f"vshufps xmm5, xmm8, xmm9, 221")
+ o.append(f"vmovups xmm8, xmmword ptr [{t.arg64(1)}+0x20]")
+ o.append(f"vmovups xmm9, xmmword ptr [{t.arg64(1)}+0x30]")
+ o.append(f"vshufps xmm6, xmm8, xmm9, 136")
+ o.append(f"vshufps xmm7, xmm8, xmm9, 221")
+ o.append(f"vpshufd xmm6, xmm6, 0x93")
+ o.append(f"vpshufd xmm7, xmm7, 0x93")
+ elif t.extension in (SSE41, SSE2):
+ o.append(f"movups xmm0, xmmword ptr [{t.arg64(0)}]")
+ o.append(f"movups xmm1, xmmword ptr [{t.arg64(0)}+0x10]")
+ o.append(f"movaps xmm2, xmmword ptr [BLAKE3_IV+rip]")
+ o.append(f"shl {t.arg64(4)}, 32")
+ o.append(f"mov {t.arg32(3)}, {t.arg32(3)}")
+ o.append(f"or {t.arg64(3)}, {t.arg64(4)}")
+ # NOTE(review): vmovq is the VEX (AVX) encoding; an SSE2/SSE4.1-only
+ # routine should probably emit plain movq here, or it will fault on
+ # CPUs without AVX — confirm the intended CPU baseline.
+ o.append(f"vmovq xmm3, {t.arg64(2)}")
+ o.append(f"vmovq xmm4, {t.arg64(3)}")
+ o.append(f"punpcklqdq xmm3, xmm4")
+ o.append(f"movups xmm4, xmmword ptr [{t.arg64(1)}]")
+ o.append(f"movups xmm5, xmmword ptr [{t.arg64(1)}+0x10]")
+ o.append(f"movaps xmm8, xmm4")
+ o.append(f"shufps xmm4, xmm5, 136")
+ o.append(f"shufps xmm8, xmm5, 221")
+ o.append(f"movaps xmm5, xmm8")
+ o.append(f"movups xmm6, xmmword ptr [{t.arg64(1)}+0x20]")
+ o.append(f"movups xmm7, xmmword ptr [{t.arg64(1)}+0x30]")
+ o.append(f"movaps xmm8, xmm6")
+ o.append(f"shufps xmm6, xmm7, 136")
+ o.append(f"pshufd xmm6, xmm6, 0x93")
+ o.append(f"shufps xmm8, xmm7, 221")
+ o.append(f"pshufd xmm7, xmm8, 0x93")
+ else:
+ raise NotImplementedError
+
+
+def compress_finish(t, o):
+ # Store the compressed state rows 0-1 back over the caller's CV buffer
+ # (arg0), i.e. compression happens in place.
+ if t.extension == AVX512:
+ o.append(f"vmovdqu xmmword ptr [{t.arg64(0)}], xmm0")
+ o.append(f"vmovdqu xmmword ptr [{t.arg64(0)}+0x10], xmm1")
+ elif t.extension in (SSE41, SSE2):
+ o.append(f"movups xmmword ptr [{t.arg64(0)}], xmm0")
+ o.append(f"movups xmmword ptr [{t.arg64(0)}+0x10], xmm1")
+ else:
+ raise NotImplementedError
+
+
+def compress(t, o):
+ # Emit the exported blake3_<ext>_compress symbol: prologue, a call into
+ # the shared kernel label, then the epilogue store.
+ name = f"blake3_{t.extension}_compress"
+ o.append(f".global {name}")
+ o.append(f"{name}:")
+ compress_setup(t, o)
+ o.append(f"call blake3_{t.extension}_kernel_1")
+ compress_finish(t, o)
+ o.append(t.ret())
+
+
+def emit_prelude(t, o):
+ # Intentionally empty for now. The commented-out directive is kept for
+ # reference: it would be needed if the output were assembled standalone
+ # with AT&T-syntax defaults — TODO confirm how out.S is consumed.
+ # o.append(".intel_syntax noprefix")
+ pass
+
+
+def emit_sse2(t, o):
+ # Emit the SSE2 kernel and compress function, plus the four mask
+ # constants that permute_message_rows uses to emulate pblendw on SSE2.
+ t = replace(t, extension=SSE2)
+ kernel_1(t, o)
+ compress(t, o)
+ o.append(".balign 16")
+ o.append("PBLENDW_0x33_MASK:")
+ o.append(".long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000")
+ o.append("PBLENDW_0xCC_MASK:")
+ o.append(".long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF")
+ o.append("PBLENDW_0x3F_MASK:")
+ o.append(".long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000")
+ o.append("PBLENDW_0xC0_MASK:")
+ o.append(".long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF")
+
+
+def emit_sse41(t, o):
+ # Emit the SSE4.1 kernel and compress function, plus the pshufb shuffle
+ # tables that implement the byte-granular rotations (loaded into
+ # xmm15/xmm14 at the top of kernel_1).
+ t = replace(t, extension=SSE41)
+ kernel_1(t, o)
+ compress(t, o)
+ o.append(".balign 16")
+ o.append("ROT16:")
+ o.append(".byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13")
+ o.append("ROT8:")
+ o.append(".byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12")
+
+
+def emit_avx2(t, o):
+ # Placeholder: AVX2 codegen is not implemented yet, so this emits
+ # nothing (the replaced Target is discarded).
+ t = replace(t, extension=AVX2)
+
+
+def emit_avx512(t, o):
+ # Emit the AVX-512 kernel and compress function. No extra constants are
+ # needed: vprord handles the rotations directly.
+ t = replace(t, extension=AVX512)
+ kernel_1(t, o)
+ compress(t, o)
+
+
+def emit_footer(t, o):
+ # Emit the BLAKE3 IV constants (first four words), shared by every
+ # extension's compress_setup. Each word also gets its own label.
+ o.append(".balign 16")
+ o.append("BLAKE3_IV:")
+ o.append("BLAKE3_IV_0:")
+ o.append(".long 0x6A09E667")
+ o.append("BLAKE3_IV_1:")
+ o.append(".long 0xBB67AE85")
+ o.append("BLAKE3_IV_2:")
+ o.append(".long 0x3C6EF372")
+ o.append("BLAKE3_IV_3:")
+ o.append(".long 0xA54FF53A")
+
+
+def format(output):
+ # Print the collected lines as an assembly file on stdout. Labels
+ # (contain ":") and directives (start with ".") go in column 0;
+ # instructions are indented 8 spaces.
+ # NOTE(review): shadows the `format` builtin — harmless here, but worth
+ # renaming if the script grows.
+ print("# DO NOT EDIT")
+ print("# This file is generated by asm.py.")
+ for item in output:
+ if ":" in item or item[0] == ".":
+ print(item)
+ else:
+ print(" " * 8 + item)
+
+
+def main():
+ # Build the full output file: SSE2, SSE4.1, AVX2 (stub), and AVX-512
+ # sections, followed by the shared IV constants.
+ # extension=None is a placeholder; each emit_* replaces it.
+ target = Target(os=LINUX, arch=X86_64, extension=None)
+ output = []
+
+ emit_prelude(target, output)
+ emit_sse2(target, output)
+ emit_sse41(target, output)
+ emit_avx2(target, output)
+ emit_avx512(target, output)
+ emit_footer(target, output)
+
+ format(output)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/asm/out.S b/asm/out.S
new file mode 100644
index 0000000..20cacfc
--- /dev/null
+++ b/asm/out.S
@@ -0,0 +1,1359 @@
+# DO NOT EDIT
+# This file is generated by asm.py.
+blake3_sse2_kernel_1:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm11, xmm3
+ pslld xmm3, 24
+ psrld xmm11, 8
+ por xmm3, xmm11
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ ret
+# SSE2 single-block compress. Per asm.py's compress_setup, the System V
+# arguments appear to be: rdi = CV (updated in place), rsi = 64-byte block,
+# rdx = 64-bit counter, ecx = block_len, r8d = flags — TODO confirm against
+# the Rust declaration.
+.global blake3_sse2_compress
+blake3_sse2_compress:
+ # Rows 0-1 = chaining value, row 2 = IV constants.
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ # Row 3 = [counter, block_len | flags << 32]; the 32-bit self-move
+ # zero-extends ecx.
+ shl r8, 32
+ mov ecx, ecx
+ or rcx, r8
+ # NOTE(review): vmovq is AVX-encoded; plain movq would keep this routine
+ # SSE2-only — confirm the intended CPU baseline.
+ vmovq xmm3, rdx
+ vmovq xmm4, rcx
+ punpcklqdq xmm3, xmm4
+ # Deinterleave the 16 message words into rows xmm4-xmm7.
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ # Run the 7-round kernel, then store rows 0-1 back over the CV.
+ call blake3_sse2_kernel_1
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+.balign 16
+PBLENDW_0x33_MASK:
+.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK:
+.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK:
+.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+blake3_sse41_kernel_1:
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ ret
+.global blake3_sse41_compress
+# void blake3_sse41_compress(uint32_t cv[8], const uint8_t block[64],
+#                            uint64_t counter, uint32_t block_len, uint32_t flags)
+# System V args: rdi=cv, rsi=block, rdx=counter, ecx=block_len, r8d=flags.
+# Writes the first 8 output words back through cv (stores to [rdi] below).
+blake3_sse41_compress:
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ # Pack block_len (low dword) and flags (high dword) into one qword;
+ # mov ecx, ecx zero-extends block_len, shl discards r8's undefined
+ # upper bits before the or.
+ shl r8, 32
+ mov ecx, ecx
+ or rcx, r8
+ # BUGFIX: use legacy-SSE movq here. The generated code used the
+ # VEX-encoded vmovq, which raises #UD on CPUs that have SSE4.1 but
+ # not AVX -- exactly the CPUs this entry point exists for.
+ movq xmm3, rdx
+ movq xmm4, rcx
+ punpcklqdq xmm3, xmm4
+ # De-interleave the 64-byte block into the kernel's message-word layout.
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ call blake3_sse41_kernel_1
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+.balign 16
+# pshufb masks implementing per-32-bit-lane byte rotations for the G
+# function's rotate steps (presumably loaded into xmm15/xmm14 by the
+# SSE4.1 kernel prologue, which is outside this hunk -- verify).
+# ROT16: swap the 16-bit halves of each lane = rotate right by 16.
+ROT16:
+.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+# ROT8: rotate each 32-bit lane right by 8 bits.
+ROT8:
+.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+# Seven-round BLAKE3 compression kernel using AVX-512VL encodings on xmm
+# registers (vprord replaces the shift/or rotate sequences of the SSE paths).
+# State rows live in xmm0..xmm3, message words in xmm4..xmm7, xmm8/xmm9 are
+# scratch. Returns with the two output rows in xmm0/xmm1; the caller stores
+# them. Each round is: two column G-steps (msg xmm4, xmm5), diagonalize via
+# vpshufd, two diagonal G-steps (msg xmm6, xmm7), undiagonalize.
+blake3_avx512_kernel_1:
+ # round 1
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 2
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 2
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 3
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 3
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 4
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 4
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 5
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 5
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 6
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 6
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # permute message words for round 7
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ # round 7 (final round; no message permutation afterwards)
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ # finalize: xor the two state halves to produce the output rows
+ vpxord xmm0, xmm0, xmm2
+ vpxord xmm1, xmm1, xmm3
+ ret
+.global blake3_avx512_compress
+# void blake3_avx512_compress(uint32_t cv[8], const uint8_t block[64],
+#                             uint64_t counter, uint32_t block_len, uint32_t flags)
+# System V args: rdi=cv, rsi=block, rdx=counter, ecx=block_len, r8d=flags.
+# Writes the first 8 output words back through cv (stores to [rdi] below).
+blake3_avx512_compress:
+ vmovdqu xmm0, xmmword ptr [rdi]
+ vmovdqu xmm1, xmmword ptr [rdi+0x10]
+ # Pack block_len (low dword) and flags (high dword) into one qword;
+ # mov ecx, ecx zero-extends block_len, shl discards r8's undefined
+ # upper bits before the or.
+ shl r8, 32
+ mov ecx, ecx
+ or rcx, r8
+ vmovq xmm3, rdx
+ vmovq xmm4, rcx
+ vpunpcklqdq xmm3, xmm3, xmm4
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ # De-interleave the 64-byte block into the kernel's message-word layout.
+ vmovups xmm8, xmmword ptr [rsi]
+ vmovups xmm9, xmmword ptr [rsi+0x10]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [rsi+0x20]
+ vmovups xmm9, xmmword ptr [rsi+0x30]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ call blake3_avx512_kernel_1
+ vmovdqu xmmword ptr [rdi], xmm0
+ vmovdqu xmmword ptr [rdi+0x10], xmm1
+ ret
+.balign 16
+# First four words of the BLAKE3 IV (the SHA-256 initial hash constants),
+# loaded as a single 128-bit vector into the third state row by both
+# compress entry points above. Only IV[0..4] are needed here: state rows
+# 0-1 come from the caller's chaining value.
+BLAKE3_IV:
+BLAKE3_IV_0:
+.long 0x6A09E667
+BLAKE3_IV_1:
+.long 0xBB67AE85
+BLAKE3_IV_2:
+.long 0x3C6EF372
+BLAKE3_IV_3:
+.long 0xA54FF53A
diff --git a/benches/bench.rs b/benches/bench.rs
index 68675c4..32e6319 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -108,6 +108,40 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
}
}
+/// Shared driver for the single-block kernel benchmarks: repeatedly
+/// compresses one random 64-byte block in place through `f`.
+fn bench_kernel_compression_fn(b: &mut Bencher, f: blake3::kernel::CompressionFn) {
+    let mut state = [1u32; 8];
+    let mut r = RandomInput::new(b, 64);
+    let input = array_ref!(r.get(), 0, 64);
+    // BUGFIX: counter and block_len were transposed (counter=64,
+    // block_len=0). The signature is (cv, block, counter, block_len,
+    // flags); a full 64-byte block at counter 0 is what this benchmark
+    // intends to measure.
+    b.iter(|| unsafe { f(&mut state, input, 0, 64, 0) });
+}
+
+#[bench]
+// The kernel assembly is written against the 64-bit System V ABI
+// (rdi/rsi/rdx/rcx/r8), so these symbols only exist on x86_64. Gate on
+// x86_64 alone, matching the tests in src/kernel.rs; the previous
+// `any(x86, x86_64)` gate would fail to link on 32-bit x86.
+#[cfg(target_arch = "x86_64")]
+fn bench_kernel_compression_sse2(b: &mut Bencher) {
+    if !is_x86_feature_detected!("sse2") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_sse2_compress);
+}
+
+#[bench]
+// The kernel assembly is x86_64-only (64-bit System V ABI), so gate on
+// x86_64 like the tests in src/kernel.rs; the previous `any(x86, x86_64)`
+// gate would fail to link on 32-bit x86.
+#[cfg(target_arch = "x86_64")]
+fn bench_kernel_compression_sse41(b: &mut Bencher) {
+    if !is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_sse41_compress);
+}
+
+#[bench]
+// NOTE(review): `blake3_avx512_ffi` is a build-script cfg; presumably it
+// already implies an x86_64 target with assembly support -- verify
+// against build.rs. Runtime detection below guards the actual dispatch.
+#[cfg(blake3_avx512_ffi)]
+fn bench_kernel_compression_avx512(b: &mut Bencher) {
+    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_avx512_compress);
+}
+
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
diff --git a/src/kernel.rs b/src/kernel.rs
index 68c478d..0be9df3 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1,6 +1,91 @@
use crate::CHUNK_LEN;
use std::arch::{asm, global_asm};
+global_asm!(include_str!("../asm/out.S"));
+
+extern "C" {
+    // Single-block compression kernels defined in asm/out.S. Each one
+    // compresses `block` into `cv` and writes the resulting 8-word
+    // chaining value back through `cv` (the asm stores to [rdi]).
+    //
+    // BUGFIX: `cv` must be `&mut [u32; 8]`. The assembly mutates through
+    // the pointer, so declaring it as a shared `&[u32; 8]` was unsound
+    // (mutation behind an immutable reference). All existing callers
+    // already pass `&mut`, so this is a compatible change.
+    pub fn blake3_sse2_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+    pub fn blake3_sse41_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+    pub fn blake3_avx512_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+}
+
+/// Common signature of the single-block compression kernels above, used
+/// by tests and benchmarks to select an implementation at runtime.
+pub type CompressionFn = unsafe extern "C" fn(
+    cv: &mut [u32; 8],
+    block: &[u8; 64],
+    counter: u64,
+    block_len: u32,
+    flags: u32,
+);
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    // Checks one asm kernel against the portable reference implementation
+    // on a single partially-filled block with non-trivial counter/flags.
+    fn test_compression_function(f: CompressionFn) {
+        let mut block = [0; 64];
+        // Deliberately shorter than a full block to exercise block_len.
+        let block_len = 53;
+        crate::test::paint_test_input(&mut block[..block_len]);
+        // Large counter so both 32-bit halves are nonzero -- presumably to
+        // catch bugs in the counter-splitting asm; verify intent.
+        let counter = u64::MAX - 42;
+        let flags = crate::CHUNK_START | crate::CHUNK_END | crate::ROOT;
+
+        // Expected chaining value from the portable implementation.
+        let mut expected = *crate::IV;
+        crate::platform::Platform::Portable.compress_in_place(
+            &mut expected,
+            &block,
+            block_len as u8,
+            counter,
+            flags,
+        );
+
+        // Same inputs through the asm kernel under test.
+        let mut found = *crate::IV;
+        unsafe {
+            f(&mut found, &block, counter, block_len as u32, flags as u32);
+        }
+
+        assert_eq!(expected, found);
+    }
+
+    // Each test checks the CPU feature at runtime and silently skips when
+    // unavailable, matching the benchmarks' convention.
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_sse2_compress() {
+        if !is_x86_feature_detected!("sse2") {
+            return;
+        }
+        test_compression_function(blake3_sse2_compress);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_sse41_compress() {
+        if !is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        test_compression_function(blake3_sse41_compress);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_avx512_compress() {
+        if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
+            return;
+        }
+        test_compression_function(blake3_avx512_compress);
+    }
+}
+
global_asm!(
// --------------------------------------------------------------------------------------------
// blake3_avx512_kernel_16