diff options
| author | Jack O'Connor <[email protected]> | 2022-03-12 01:14:10 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-15 14:03:02 -0400 |
| commit | ee558b2f3218cf77b58afb65a9e493fa99642080 (patch) | |
| tree | 60a910ca7bba8095536e45afd14ef3683007c14d | |
| parent | 2e5eb837e53722dc121cca50ff4fc27d3413dcea (diff) | |
generate blake3_{avx512,sse41,sse2}_compress with asm.py
| -rwxr-xr-x | asm/asm.py | 381 | ||||
| -rw-r--r-- | asm/out.S | 1359 | ||||
| -rw-r--r-- | benches/bench.rs | 34 | ||||
| -rw-r--r-- | src/kernel.rs | 85 |
4 files changed, 1859 insertions, 0 deletions
#! /usr/bin/env python3

# Generate asm!
#
# Prints Intel-syntax x86-64 assembly for the single-block BLAKE3 compression
# function to stdout, in three flavours: SSE2, SSE4.1 and AVX-512.
#
# Register conventions shared by every routine below:
#   xmm0-xmm3    the four state rows
#   xmm4-xmm7    the four message rows
#   xmm8-xmm11   temporaries (xmm8-xmm10: message permutation, xmm11: rotates)
#   xmm14/xmm15  ROT8/ROT16 pshufb masks (SSE4.1 only)

from dataclasses import dataclass, replace
from typing import Optional

X86_64 = "x86_64"
AVX512 = "avx512"
AVX2 = "avx2"
SSE41 = "sse41"
SSE2 = "sse2"
LINUX = "linux"


@dataclass
class Target:
    """The platform that assembly is being generated for."""

    # x86_64
    arch: str  # "x86_64"

    # sse2, sse41, avx2, avx512, neon
    # main() passes None as a placeholder; each emit_*() fills it in via replace().
    extension: Optional[str]  # "sse41"

    # unix, windows_msvc, windows_gnu
    os: str

    # NOTE(review): the four lookups below hard-code System V AMD64 registers
    # and ignore self.os, so only Unix-like targets are really supported.

    def arg32(self, index):
        """Return the 32-bit name of integer argument register `index` (0-5)."""
        system_v_args_32 = ["edi", "esi", "edx", "ecx", "r8d", "r9d"]
        return system_v_args_32[index]

    def arg64(self, index):
        """Return the 64-bit name of integer argument register `index` (0-5)."""
        system_v_args_64 = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
        return system_v_args_64[index]

    def scratch32(self, index):
        """Return the 32-bit name of caller-saved scratch register `index` (0-2)."""
        system_v_scratch_32 = ["eax", "r10d", "r11d"]
        return system_v_scratch_32[index]

    def scratch64(self, index):
        """Return the 64-bit name of caller-saved scratch register `index` (0-2)."""
        system_v_scratch_64 = ["rax", "r10", "r11"]
        return system_v_scratch_64[index]

    def reg128(self, index):
        """Return the name of 128-bit vector register `index`."""
        assert self.arch == X86_64
        return "xmm" + str(index)

    def ret(self):
        """Return the function-return instruction."""
        return "ret"


def add_row(t, o, dest, src):
    """Append `xmm{dest} += xmm{src}` (four 32-bit lanes, wrapping) to `o`."""
    assert t.arch == X86_64
    if t.extension == AVX512:
        o.append(f"vpaddd xmm{dest}, xmm{dest}, xmm{src}")
    elif t.extension in (SSE41, SSE2):
        o.append(f"paddd xmm{dest}, xmm{src}")
    else:
        raise NotImplementedError


def xor_row(t, o, dest, src):
    """Append `xmm{dest} ^= xmm{src}` to `o`."""
    assert t.arch == X86_64
    if t.extension == AVX512:
        o.append(f"vpxord xmm{dest}, xmm{dest}, xmm{src}")
    elif t.extension in (SSE41, SSE2):
        o.append(f"pxor xmm{dest}, xmm{src}")
    else:
        raise NotImplementedError


def _rotate_with_shifts(o, reg, bits):
    """Rotate each 32-bit lane of xmm{reg} right by `bits` using two shifts.

    Clobbers xmm11, which is used as the temporary.
    """
    o.append(f"movdqa xmm11, xmm{reg}")
    o.append(f"pslld xmm{reg}, {32 - bits}")
    o.append(f"psrld xmm11, {bits}")
    o.append(f"por xmm{reg}, xmm11")


# This is the >>> operation in G, not state diagonalization or message permutation.
def bitrotate_row(t, o, reg, bits):
    """Append a right-rotation of every 32-bit lane of xmm{reg} by `bits`."""
    assert t.arch == X86_64
    if t.extension == AVX512:
        o.append(f"vprord xmm{reg}, xmm{reg}, {bits}")
    elif t.extension == SSE41:
        if bits == 16:
            # xmm15 is initialized at the top of kernel_1.
            o.append(f"pshufb xmm{reg}, xmm15")
        elif bits == 8:
            # xmm14 is initialized at the top of kernel_1.
            o.append(f"pshufb xmm{reg}, xmm14")
        else:
            _rotate_with_shifts(o, reg, bits)
    elif t.extension == SSE2:
        if bits == 16:
            # Swapping the 16-bit halves of every lane is a rotation by 16.
            o.append(f"pshuflw xmm{reg}, xmm{reg}, 0xB1")
            o.append(f"pshufhw xmm{reg}, xmm{reg}, 0xB1")
        else:
            _rotate_with_shifts(o, reg, bits)
    else:
        raise NotImplementedError


def diagonalize_state_rows(t, o):
    """Append the lane rotations of rows 0, 3 and 2 that precede the diagonal step."""
    if t.extension == AVX512:
        o.append("vpshufd xmm0, xmm0, 0x93")
        o.append("vpshufd xmm3, xmm3, 0x4E")
        o.append("vpshufd xmm2, xmm2, 0x39")
    elif t.extension in (SSE41, SSE2):
        o.append("pshufd xmm0, xmm0, 0x93")
        o.append("pshufd xmm3, xmm3, 0x4E")
        o.append("pshufd xmm2, xmm2, 0x39")
    else:
        raise NotImplementedError


def undiagonalize_state_rows(t, o):
    """Append the inverse of diagonalize_state_rows()."""
    if t.extension == AVX512:
        o.append("vpshufd xmm0, xmm0, 0x39")
        o.append("vpshufd xmm3, xmm3, 0x4E")
        o.append("vpshufd xmm2, xmm2, 0x93")
    elif t.extension in (SSE41, SSE2):
        o.append("pshufd xmm0, xmm0, 0x39")
        o.append("pshufd xmm3, xmm3, 0x4E")
        o.append("pshufd xmm2, xmm2, 0x93")
    else:
        raise NotImplementedError


def permute_message_rows(t, o):
    """Append the between-rounds permutation of the message rows xmm4-xmm7.

    Uses xmm8 and xmm9 (and, on SSE2, xmm10) as temporaries. The SSE2 variant
    has no blend instruction, so it emulates pblendw with pand/por against the
    PBLENDW_*_MASK constants emitted by emit_sse2().
    """
    if t.extension == AVX512:
        o.extend(
            [
                "vshufps xmm8, xmm4, xmm5, 214",
                "vpshufd xmm9, xmm4, 0x0F",
                "vpshufd xmm4, xmm8, 0x39",
                "vshufps xmm8, xmm6, xmm7, 250",
                "vpblendd xmm9, xmm9, xmm8, 0xAA",
                "vpunpcklqdq xmm8, xmm7, xmm5",
                "vpblendd xmm8, xmm8, xmm6, 0x88",
                "vpshufd xmm8, xmm8, 0x78",
                "vpunpckhdq xmm5, xmm5, xmm7",
                "vpunpckldq xmm6, xmm6, xmm5",
                "vpshufd xmm7, xmm6, 0x1E",
                "vmovdqa xmm5, xmm9",
                "vmovdqa xmm6, xmm8",
            ]
        )
    elif t.extension == SSE41:
        o.extend(
            [
                "movdqa xmm8, xmm4",
                "shufps xmm8, xmm5, 214",
                "pshufd xmm9, xmm4, 0x0F",
                "pshufd xmm4, xmm8, 0x39",
                "movdqa xmm8, xmm6",
                "shufps xmm8, xmm7, 250",
                "pblendw xmm9, xmm8, 0xCC",
                "movdqa xmm8, xmm7",
                "punpcklqdq xmm8, xmm5",
                "pblendw xmm8, xmm6, 0xC0",
                "pshufd xmm8, xmm8, 0x78",
                "punpckhdq xmm5, xmm7",
                "punpckldq xmm6, xmm5",
                "pshufd xmm7, xmm6, 0x1E",
                "movdqa xmm5, xmm9",
                "movdqa xmm6, xmm8",
            ]
        )
    elif t.extension == SSE2:
        o.extend(
            [
                "movdqa xmm8, xmm4",
                "shufps xmm8, xmm5, 214",
                "pshufd xmm9, xmm4, 0x0F",
                "pshufd xmm4, xmm8, 0x39",
                "movdqa xmm8, xmm6",
                "shufps xmm8, xmm7, 250",
                "pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]",
                "pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]",
                "por xmm9, xmm8",
                "movdqa xmm8, xmm7",
                "punpcklqdq xmm8, xmm5",
                "movdqa xmm10, xmm6",
                "pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]",
                "pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]",
                "por xmm8, xmm10",
                "pshufd xmm8, xmm8, 0x78",
                "punpckhdq xmm5, xmm7",
                "punpckldq xmm6, xmm5",
                "pshufd xmm7, xmm6, 0x1E",
                "movdqa xmm5, xmm9",
                "movdqa xmm6, xmm8",
            ]
        )
    else:
        raise NotImplementedError


def _mix_rows(t, o, first_msg, second_msg):
    """Append one vectorized G step over the state rows xmm0-xmm3.

    `first_msg` and `second_msg` are the numbers of the message-row registers
    that are added into row 0 in the first and second half of the step.
    """
    add_row(t, o, dest=0, src=first_msg)
    add_row(t, o, dest=0, src=1)
    xor_row(t, o, dest=3, src=0)
    bitrotate_row(t, o, reg=3, bits=16)
    add_row(t, o, dest=2, src=3)
    xor_row(t, o, dest=1, src=2)
    bitrotate_row(t, o, reg=1, bits=12)
    add_row(t, o, dest=0, src=second_msg)
    add_row(t, o, dest=0, src=1)
    xor_row(t, o, dest=3, src=0)
    bitrotate_row(t, o, reg=3, bits=8)
    add_row(t, o, dest=2, src=3)
    xor_row(t, o, dest=1, src=2)
    bitrotate_row(t, o, reg=1, bits=7)


def kernel_1(t, o):
    """Append the seven-round, single-block kernel `blake3_{extension}_kernel_1`.

    The kernel expects the state in xmm0-xmm3 and the message in xmm4-xmm7
    (see compress_setup()) and leaves its result in xmm0 and xmm1.
    """
    o.append(f"blake3_{t.extension}_kernel_1:")
    if t.extension == SSE41:
        # Byte-shuffle masks consumed by bitrotate_row() for the 8- and 16-bit rotates.
        o.append("movaps xmm14, xmmword ptr [ROT8+rip]")
        o.append("movaps xmm15, xmmword ptr [ROT16+rip]")
    for round_number in range(7):
        if round_number > 0:
            # Permute the message before each round except the first. The state
            # was already un-diagonalized at the end of the previous round.
            permute_message_rows(t, o)
        # Column step.
        _mix_rows(t, o, first_msg=4, second_msg=5)
        diagonalize_state_rows(t, o)
        # Diagonal step.
        _mix_rows(t, o, first_msg=6, second_msg=7)
        undiagonalize_state_rows(t, o)
    # Xor the last two rows into the first two, but don't do the feed forward
    # here. That's only done in the XOF case.
    xor_row(t, o, dest=0, src=2)
    xor_row(t, o, dest=1, src=3)
    o.append(t.ret())


def compress_setup(t, o):
    """Append the code that loads the kernel's inputs into xmm0-xmm7.

    Arguments of the generated function, as read by this code:
      arg 0: pointer to 32 bytes that become state rows xmm0 and xmm1
      arg 1: pointer to the 64-byte message block
      arg 2: 64-bit value placed in the low half of xmm3
      arg 3: value whose low 32 bits go into lane 2 of xmm3
      arg 4: value whose low 32 bits go into lane 3 of xmm3
    NOTE(review): presumably args 2-4 are the BLAKE3 counter, block length and
    flags -- confirm against the Rust caller.

    Row xmm2 is loaded from BLAKE3_IV. The message words are de-interleaved
    (shufps 136 keeps the even-indexed words, 221 the odd-indexed ones) and the
    last two rows are pre-rotated with pshufd 0x93.
    """
    if t.extension == AVX512:
        o.append(f"vmovdqu xmm0, xmmword ptr [{t.arg64(0)}]")
        o.append(f"vmovdqu xmm1, xmmword ptr [{t.arg64(0)}+0x10]")
        o.append(f"shl {t.arg64(4)}, 32")
        # A 32-bit mov to itself zero-extends arg 3 to 64 bits.
        o.append(f"mov {t.arg32(3)}, {t.arg32(3)}")
        o.append(f"or {t.arg64(3)}, {t.arg64(4)}")
        o.append(f"vmovq xmm3, {t.arg64(2)}")
        o.append(f"vmovq xmm4, {t.arg64(3)}")
        o.append("vpunpcklqdq xmm3, xmm3, xmm4")
        o.append("vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]")
        o.append(f"vmovups xmm8, xmmword ptr [{t.arg64(1)}]")
        o.append(f"vmovups xmm9, xmmword ptr [{t.arg64(1)}+0x10]")
        o.append("vshufps xmm4, xmm8, xmm9, 136")
        o.append("vshufps xmm5, xmm8, xmm9, 221")
        o.append(f"vmovups xmm8, xmmword ptr [{t.arg64(1)}+0x20]")
        o.append(f"vmovups xmm9, xmmword ptr [{t.arg64(1)}+0x30]")
        o.append("vshufps xmm6, xmm8, xmm9, 136")
        o.append("vshufps xmm7, xmm8, xmm9, 221")
        o.append("vpshufd xmm6, xmm6, 0x93")
        o.append("vpshufd xmm7, xmm7, 0x93")
    elif t.extension in (SSE41, SSE2):
        o.append(f"movups xmm0, xmmword ptr [{t.arg64(0)}]")
        o.append(f"movups xmm1, xmmword ptr [{t.arg64(0)}+0x10]")
        o.append("movaps xmm2, xmmword ptr [BLAKE3_IV+rip]")
        o.append(f"shl {t.arg64(4)}, 32")
        # A 32-bit mov to itself zero-extends arg 3 to 64 bits.
        o.append(f"mov {t.arg32(3)}, {t.arg32(3)}")
        o.append(f"or {t.arg64(3)}, {t.arg64(4)}")
        # Use the legacy-encoded movq here. The VEX-encoded vmovq requires AVX,
        # so it raises #UD on the SSE2/SSE4.1-only CPUs this path exists for.
        o.append(f"movq xmm3, {t.arg64(2)}")
        o.append(f"movq xmm4, {t.arg64(3)}")
        o.append("punpcklqdq xmm3, xmm4")
        o.append(f"movups xmm4, xmmword ptr [{t.arg64(1)}]")
        o.append(f"movups xmm5, xmmword ptr [{t.arg64(1)}+0x10]")
        o.append("movaps xmm8, xmm4")
        o.append("shufps xmm4, xmm5, 136")
        o.append("shufps xmm8, xmm5, 221")
        o.append("movaps xmm5, xmm8")
        o.append(f"movups xmm6, xmmword ptr [{t.arg64(1)}+0x20]")
        o.append(f"movups xmm7, xmmword ptr [{t.arg64(1)}+0x30]")
        o.append("movaps xmm8, xmm6")
        o.append("shufps xmm6, xmm7, 136")
        o.append("pshufd xmm6, xmm6, 0x93")
        o.append("shufps xmm8, xmm7, 221")
        o.append("pshufd xmm7, xmm8, 0x93")
    else:
        raise NotImplementedError


def compress_finish(t, o):
    """Append the stores of xmm0 and xmm1 back to the 32 bytes at arg 0."""
    if t.extension == AVX512:
        o.append(f"vmovdqu xmmword ptr [{t.arg64(0)}], xmm0")
        o.append(f"vmovdqu xmmword ptr [{t.arg64(0)}+0x10], xmm1")
    elif t.extension in (SSE41, SSE2):
        o.append(f"movups xmmword ptr [{t.arg64(0)}], xmm0")
        o.append(f"movups xmmword ptr [{t.arg64(0)}+0x10], xmm1")
    else:
        raise NotImplementedError


def compress(t, o):
    """Append the exported `blake3_{extension}_compress`: setup, kernel call, store."""
    name = f"blake3_{t.extension}_compress"
    o.append(f".global {name}")
    o.append(f"{name}:")
    compress_setup(t, o)
    o.append(f"call blake3_{t.extension}_kernel_1")
    compress_finish(t, o)
    o.append(t.ret())


def emit_prelude(t, o):
    """Emit nothing; reserved for assembler directives that must come first."""
    # NOTE(review): the ".intel_syntax noprefix" directive is deliberately not
    # emitted -- presumably the consumer already assembles the output in Intel
    # syntax. Confirm before enabling it.
    # o.append(".intel_syntax noprefix")
    pass


def emit_sse2(t, o):
    """Append the SSE2 kernel, its compress wrapper and its blend-mask constants."""
    t = replace(t, extension=SSE2)
    kernel_1(t, o)
    compress(t, o)
    # The masks are read as aligned 16-byte memory operands of pand.
    o.append(".balign 16")
    o.append("PBLENDW_0x33_MASK:")
    o.append(".long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000")
    o.append("PBLENDW_0xCC_MASK:")
    o.append(".long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF")
    o.append("PBLENDW_0x3F_MASK:")
    o.append(".long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000")
    o.append("PBLENDW_0xC0_MASK:")
    o.append(".long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF")


def emit_sse41(t, o):
    """Append the SSE4.1 kernel, its compress wrapper and its pshufb rotate masks."""
    t = replace(t, extension=SSE41)
    kernel_1(t, o)
    compress(t, o)
    # kernel_1 loads both masks with movaps, which needs 16-byte alignment.
    o.append(".balign 16")
    o.append("ROT16:")
    o.append(".byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13")
    o.append("ROT8:")
    o.append(".byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12")


def emit_avx2(t, o):
    """Placeholder: no AVX2 code is generated yet, so nothing is appended."""


def emit_avx512(t, o):
    """Append the AVX-512 kernel and its compress wrapper."""
    t = replace(t, extension=AVX512)
    kernel_1(t, o)
    compress(t, o)


def emit_footer(t, o):
    """Append the 16-byte-aligned BLAKE3_IV constant shared by every flavour."""
    o.append(".balign 16")
    o.append("BLAKE3_IV:")
    o.append("BLAKE3_IV_0:")
    o.append(".long 0x6A09E667")
    o.append("BLAKE3_IV_1:")
    o.append(".long 0xBB67AE85")
    o.append("BLAKE3_IV_2:")
    o.append(".long 0x3C6EF372")
    o.append("BLAKE3_IV_3:")
    o.append(".long 0xA54FF53A")


def format(output):
    """Print the generated lines to stdout behind a "do not edit" banner.

    Labels (anything containing ":") and directives (anything starting with
    ".") are printed flush left; instructions are indented by eight spaces.
    """
    print("# DO NOT EDIT")
    print("# This file is generated by asm.py.")
    for item in output:
        if ":" in item or item.startswith("."):
            print(item)
        else:
            print(" " * 8 + item)


def main():
    """Generate every flavour for x86-64 Linux and print the result."""
    target = Target(os=LINUX, arch=X86_64, extension=None)
    output = []

    emit_prelude(target, output)
    emit_sse2(target, output)
    emit_sse41(target, output)
    emit_avx2(target, output)
    emit_avx512(target, output)
    emit_footer(target, output)

    format(output)


if __name__ == "__main__":
    main()
movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + 
pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por 
xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, 
xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd 
xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm11, xmm3 + pslld xmm3, 24 + psrld xmm11, 8 + por xmm3, xmm11 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + pxor xmm0, xmm2 + pxor xmm1, xmm3 + ret +.global blake3_sse2_compress +blake3_sse2_compress: + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] 
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + mov ecx, ecx + or rcx, r8 + vmovq xmm3, rdx + vmovq xmm4, rcx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + call blake3_sse2_kernel_1 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret +.balign 16 +PBLENDW_0x33_MASK: +.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: +.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: +.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: +.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF +blake3_sse41_kernel_1: + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + 
pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd 
xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + 
pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + 
pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm7
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 0x39
+        pshufd xmm3, xmm3, 0x4E
+        pshufd xmm2, xmm2, 0x93
+        # Message permutation: reshuffle the message rows xmm4-xmm7 for the next round
+        # (xmm8 and xmm9 are temporaries).
+        movdqa xmm8, xmm4
+        shufps xmm8, xmm5, 214
+        pshufd xmm9, xmm4, 0x0F
+        pshufd xmm4, xmm8, 0x39
+        movdqa xmm8, xmm6
+        shufps xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0xCC
+        movdqa xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0xC0
+        pshufd xmm8, xmm8, 0x78
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd xmm7, xmm6, 0x1E
+        movdqa xmm5, xmm9
+        movdqa xmm6, xmm8
+        # Last round of this kernel. Rotations by 16 and 8 are byte shuffles through the
+        # masks held in xmm15 and xmm14; rotations by 12 and 7 are shift/shift/or via xmm11.
+        paddd xmm0, xmm4
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm5
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 0x93
+        pshufd xmm3, xmm3, 0x4E
+        pshufd xmm2, xmm2, 0x39
+        paddd xmm0, xmm6
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm7
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 0x39
+        pshufd xmm3, xmm3, 0x4E
+        pshufd xmm2, xmm2, 0x93
+        # Fold rows 2 and 3 into rows 0 and 1; the caller stores xmm0/xmm1 as the result.
+        pxor xmm0, xmm2
+        pxor xmm1, xmm3
+        ret
+# blake3_sse41_compress(cv, block, counter, block_len, flags)
+# System V arguments: rdi = cv (read, then overwritten with the 32-byte result),
+# rsi = 64-byte block, rdx = counter, ecx = block_len, r8d = flags.
+.global blake3_sse41_compress
+blake3_sse41_compress:
+        # Rows 0 and 1 come from cv, row 2 from BLAKE3_IV.
+        movups xmm0, xmmword ptr [rdi]
+        movups xmm1, xmmword ptr [rdi+0x10]
+        movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+        # rcx = block_len | (flags << 32); "mov ecx, ecx" clears the upper half of rcx.
+        shl r8, 32
+        mov ecx, ecx
+        or rcx, r8
+        # Row 3 = [counter_lo, counter_hi, block_len, flags]. Use the legacy-encoded movq
+        # here: the VEX form (vmovq) requires AVX and faults on SSE4.1-only CPUs.
+        movq xmm3, rdx
+        movq xmm4, rcx
+        punpcklqdq xmm3, xmm4
+        # Load the block and split it into even-indexed words (shuffle 136) and
+        # odd-indexed words (shuffle 221); the second pair is also rotated by 0x93.
+        movups xmm4, xmmword ptr [rsi]
+        movups xmm5, xmmword ptr [rsi+0x10]
+        movaps xmm8, xmm4
+        shufps xmm4, xmm5, 136
+        shufps xmm8, xmm5, 221
+        movaps xmm5, xmm8
+        movups xmm6, xmmword ptr [rsi+0x20]
+        movups xmm7, xmmword ptr [rsi+0x30]
+        movaps xmm8, xmm6
+        shufps xmm6, xmm7, 136
+        pshufd xmm6, xmm6, 0x93
+        shufps xmm8, xmm7, 221
+        pshufd xmm7, xmm8, 0x93
+        call blake3_sse41_kernel_1
+        # Write the result back over cv.
+        movups xmmword ptr [rdi], xmm0
+        movups xmmword ptr [rdi+0x10], xmm1
+        ret
+# pshufb masks that rotate every 32-bit lane right by 16 and by 8 bits.
+.balign 16
+ROT16:
+.byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+.byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+# blake3_avx512_kernel_1: seven unrolled rounds with a message permutation between
+# consecutive rounds. In: state rows in xmm0-xmm3, message rows in xmm4-xmm7.
+# Out: xmm0 and xmm1. Uses xmm8 and xmm9 as temporaries.
+blake3_avx512_kernel_1:
+        # Round 1
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 2
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 2
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 3
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 3
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 4
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 4
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 5
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 5
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 6
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 6
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Message permutation for round 7
+        vshufps xmm8, xmm4, xmm5, 214
+        vpshufd xmm9, xmm4, 0x0F
+        vpshufd xmm4, xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        # Round 7
+        vpaddd xmm0, xmm0, xmm4
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm5
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x93
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x39
+        vpaddd xmm0, xmm0, xmm6
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 16
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 12
+        vpaddd xmm0, xmm0, xmm7
+        vpaddd xmm0, xmm0, xmm1
+        vpxord xmm3, xmm3, xmm0
+        vprord xmm3, xmm3, 8
+        vpaddd xmm2, xmm2, xmm3
+        vpxord xmm1, xmm1, xmm2
+        vprord xmm1, xmm1, 7
+        vpshufd xmm0, xmm0, 0x39
+        vpshufd xmm3, xmm3, 0x4E
+        vpshufd xmm2, xmm2, 0x93
+        # Fold rows 2 and 3 into rows 0 and 1; the caller stores xmm0/xmm1 as the result.
+        vpxord xmm0, xmm0, xmm2
+        vpxord xmm1, xmm1, xmm3
+        ret
+# blake3_avx512_compress(cv, block, counter, block_len, flags)
+# Same argument registers and row layout as blake3_sse41_compress; VEX/EVEX encodings
+# are fine here because callers require AVX-512F and AVX-512VL.
+.global blake3_avx512_compress
+blake3_avx512_compress:
+        vmovdqu xmm0, xmmword ptr [rdi]
+        vmovdqu xmm1, xmmword ptr [rdi+0x10]
+        # rcx = block_len | (flags << 32); row 3 = [counter_lo, counter_hi, block_len, flags].
+        shl r8, 32
+        mov ecx, ecx
+        or rcx, r8
+        vmovq xmm3, rdx
+        vmovq xmm4, rcx
+        vpunpcklqdq xmm3, xmm3, xmm4
+        vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+        # Split the block into even-indexed (136) and odd-indexed (221) words.
+        vmovups xmm8, xmmword ptr [rsi]
+        vmovups xmm9, xmmword ptr [rsi+0x10]
+        vshufps xmm4, xmm8, xmm9, 136
+        vshufps xmm5, xmm8, xmm9, 221
+        vmovups xmm8, xmmword ptr [rsi+0x20]
+        vmovups xmm9, xmmword ptr [rsi+0x30]
+        vshufps xmm6, xmm8, xmm9, 136
+        vshufps xmm7, xmm8, xmm9, 221
+        vpshufd xmm6, xmm6, 0x93
+        vpshufd xmm7, xmm7, 0x93
+        call blake3_avx512_kernel_1
+        # Write the result back over cv.
+        vmovdqu xmmword ptr [rdi], xmm0
+        vmovdqu xmmword ptr [rdi+0x10], xmm1
+        ret
+# Four 32-bit constants loaded as state row 2; aligned because they are read with
+# movaps/vmovaps.
+.balign 16
+BLAKE3_IV:
+BLAKE3_IV_0:
+.long 0x6A09E667
+BLAKE3_IV_1:
+.long 0xBB67AE85
+BLAKE3_IV_2:
+.long 0x3C6EF372
+BLAKE3_IV_3:
+.long 0xA54FF53A
diff --git a/benches/bench.rs b/benches/bench.rs
index 68675c4..32e6319 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -108,6
+108,40 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
     }
 }
 
+/// Benchmarks repeated calls of one assembly compression function on a random 64-byte block.
+///
+/// Callers must check that the CPU supports the instruction set `f` was built for.
+fn bench_kernel_compression_fn(b: &mut Bencher, f: blake3::kernel::CompressionFn) {
+    let mut state = [1u32; 8];
+    let mut r = RandomInput::new(b, 64);
+    let input = array_ref!(r.get(), 0, 64);
+    // Argument order is (cv, block, counter, block_len, flags): a full 64-byte block at
+    // counter 0 with no flags.
+    // SAFETY: each caller below returns early unless the required CPU feature is detected.
+    b.iter(|| unsafe { f(&mut state, input, 0, 64, 0) });
+}
+
+#[bench]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn bench_kernel_compression_sse2(b: &mut Bencher) {
+    if !is_x86_feature_detected!("sse2") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_sse2_compress);
+}
+
+#[bench]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn bench_kernel_compression_sse41(b: &mut Bencher) {
+    if !is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_sse41_compress);
+}
+
+#[bench]
+#[cfg(blake3_avx512_ffi)]
+fn bench_kernel_compression_avx512(b: &mut Bencher) {
+    if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
+        return;
+    }
+    bench_kernel_compression_fn(b, blake3::kernel::blake3_avx512_compress);
+}
+
 fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
     let degree = platform.simd_degree();
     let mut inputs = Vec::new();
diff --git a/src/kernel.rs b/src/kernel.rs
index 68c478d..0be9df3 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1,6 +1,91 @@
 use crate::CHUNK_LEN;
 use std::arch::{asm, global_asm};
 
+// Assemble the generated kernels (asm/out.S, produced by asm/asm.py) into this crate.
+global_asm!(include_str!("../asm/out.S"));
+
+// Entry points defined in asm/out.S. Each one reads `cv` and `block`, then overwrites `cv`
+// with the result, so `cv` must be an exclusive (`&mut`) reference.
+//
+// Safety: the caller must have verified that the CPU supports the instruction set named in
+// the function (see the `is_x86_feature_detected!` checks in the tests below).
+extern "C" {
+    pub fn blake3_sse2_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+    pub fn blake3_sse41_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+    pub fn blake3_avx512_compress(
+        cv: &mut [u32; 8],
+        block: &[u8; 64],
+        counter: u64,
+        block_len: u32,
+        flags: u32,
+    );
+}
+
+/// Signature shared by the assembly compression functions declared above.
+///
+/// `cv` is updated in place; it is `&mut` because the assembly writes through it.
+pub type CompressionFn = unsafe extern "C" fn(
+    cv: &mut [u32; 8],
+    block: &[u8; 64],
+    counter: u64,
+    block_len: u32,
+    flags: u32,
+);
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Checks `f` against the portable implementation on a partial block with a large
+    /// counter and several flags set.
+    fn test_compression_function(f: CompressionFn) {
+        let mut block = [0; 64];
+        let block_len = 53;
+        crate::test::paint_test_input(&mut block[..block_len]);
+        let counter = u64::MAX - 42;
+        let flags = crate::CHUNK_START | crate::CHUNK_END | crate::ROOT;
+
+        let mut expected = *crate::IV;
+        crate::platform::Platform::Portable.compress_in_place(
+            &mut expected,
+            &block,
+            block_len as u8,
+            counter,
+            flags,
+        );
+
+        let mut found = *crate::IV;
+        // SAFETY: each test below returns early unless the required CPU feature is detected.
+        unsafe {
+            f(&mut found, &block, counter, block_len as u32, flags as u32);
+        }
+
+        assert_eq!(expected, found);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_sse2_compress() {
+        if !is_x86_feature_detected!("sse2") {
+            return;
+        }
+        test_compression_function(blake3_sse2_compress);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_sse41_compress() {
+        if !is_x86_feature_detected!("sse4.1") {
+            return;
+        }
+        test_compression_function(blake3_sse41_compress);
+    }
+
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_avx512_compress() {
+        if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
+            return;
+        }
+        test_compression_function(blake3_avx512_compress);
+    }
+}
+
 global_asm!(
     // --------------------------------------------------------------------------------------------
     // blake3_avx512_kernel_16 |
