aboutsummaryrefslogtreecommitdiff
path: root/c
diff options
context:
space:
mode:
Diffstat (limited to 'c')
-rw-r--r-- c/blake3_avx512_x86-64_unix.S 588
-rw-r--r-- c/blake3_avx512_x86-64_windows_gnu.S 702
-rw-r--r-- c/blake3_avx512_x86-64_windows_msvc.asm 698
3 files changed, 986 insertions, 1002 deletions
diff --git a/c/blake3_avx512_x86-64_unix.S b/c/blake3_avx512_x86-64_unix.S
index 9b82424..868e9f8 100644
--- a/c/blake3_avx512_x86-64_unix.S
+++ b/c/blake3_avx512_x86-64_unix.S
@@ -46,7 +46,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm0, r8d
shr r8, 0x20
vpbroadcastd ymm1, r8d
- vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0+ 0]
+ vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0+0]
vmovdqa32 ymm3 {k1} {z}, ymmword ptr [rip+ADD0+32]
vpaddd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp], ymm2
@@ -61,8 +61,8 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x60], ymm1
shl rdx, 0x06
mov qword ptr [rsp+0x100], rdx
- cmp rsi, 0x10
- jb 5f
+ cmp rsi, 0x08
+ jbe 5f
.p2align 5
2:
vpbroadcastd zmm0, dword ptr [rcx]
@@ -89,39 +89,60 @@ blake3_hash_many_avx512:
mov r8, qword ptr [rdi+0x10]
mov r9, qword ptr [rdi+0x18]
mov r10, qword ptr [rdi+0x40]
- mov r11, qword ptr [rdi+0x48]
- mov r12, qword ptr [rdi+0x50]
- mov r13, qword ptr [rdi+0x58]
vmovdqu32 ymm8, ymmword ptr [rax+rdx*1-0x40]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rdx*1-0x40], 0x01
vmovdqu32 ymm9, ymmword ptr [rbx+rdx*1-0x40]
+ cmp rsi, 0x0A
+ jb 4f
+ mov r11, qword ptr [rdi+0x48]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rdx*1-0x40], 0x01
+4:
vpunpckldq zmm10, zmm8, zmm9
vpunpckhdq zmm11, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [r8+rdx*1-0x40]
+ cmp rsi, 0x0B
+ jb 4f
+ mov r12, qword ptr [rdi+0x50]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rdx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [r9+rdx*1-0x40]
+ cmp rsi, 0x0C
+ jb 4f
+ mov r13, qword ptr [rdi+0x58]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rdx*1-0x40], 0x01
+4:
vpunpckldq zmm12, zmm8, zmm9
vpunpckhdq zmm13, zmm8, zmm9
mov rax, qword ptr [rdi+0x20]
mov rbx, qword ptr [rdi+0x28]
mov r8, qword ptr [rdi+0x30]
mov r9, qword ptr [rdi+0x38]
- mov r10, qword ptr [rdi+0x60]
- mov r11, qword ptr [rdi+0x68]
- mov r12, qword ptr [rdi+0x70]
- mov r13, qword ptr [rdi+0x78]
vmovdqu32 ymm8, ymmword ptr [rax+rdx*1-0x40]
+ cmp rsi, 0x0D
+ jb 4f
+ mov r10, qword ptr [rdi+0x60]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rdx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [rbx+rdx*1-0x40]
+ cmp rsi, 0x0E
+ jb 4f
+ mov r11, qword ptr [rdi+0x68]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rdx*1-0x40], 0x01
+4:
vpunpckldq zmm14, zmm8, zmm9
vpunpckhdq zmm15, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [r8+rdx*1-0x40]
+ cmp rsi, 0x0F
+ jb 4f
+ mov r12, qword ptr [rdi+0x70]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rdx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [r9+rdx*1-0x40]
+ cmp rsi, 0x10
+ jb 4f
+ mov r13, qword ptr [rdi+0x78]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rdx*1-0x40], 0x01
+4:
vpunpckldq zmm16, zmm8, zmm9
vpunpckhdq zmm17, zmm8, zmm9
vmovdqa32 zmm8, zmmword ptr [rip+INDEX0]
@@ -151,19 +172,31 @@ blake3_hash_many_avx512:
mov r8, qword ptr [rdi+0x10]
mov r9, qword ptr [rdi+0x18]
mov r10, qword ptr [rdi+0x40]
- mov r11, qword ptr [rdi+0x48]
- mov r12, qword ptr [rdi+0x50]
- mov r13, qword ptr [rdi+0x58]
vmovdqu32 ymm11, ymmword ptr [rax+rdx*1-0x20]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rdx*1-0x20], 0x01
vmovdqu32 ymm13, ymmword ptr [rbx+rdx*1-0x20]
+ cmp rsi, 0x0A
+ jb 4f
+ mov r11, qword ptr [rdi+0x48]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r11+rdx*1+0x80]
+4:
vpunpckldq zmm15, zmm11, zmm13
vpunpckhdq zmm17, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [r8+rdx*1-0x20]
+ cmp rsi, 0x0B /* rsi < 11: skip the pointer slot at [rdi+0x50] entirely */
+ jb 4f
+ mov r12, qword ptr [rdi+0x50]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r12+rdx*1+0x80] /* prefetch via r12, the pointer just loaded; r13 is not loaded until the next group */
+4:
vmovdqu32 ymm13, ymmword ptr [r9+rdx*1-0x20]
+ cmp rsi, 0x0C
+ jb 4f
+ mov r13, qword ptr [rdi+0x58]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r13+rdx*1+0x80]
+4:
vpunpckldq zmm22, zmm11, zmm13
vpunpckhdq zmm23, zmm11, zmm13
prefetcht0 byte ptr [rax+rdx*1+0x80]
@@ -171,33 +204,42 @@ blake3_hash_many_avx512:
prefetcht0 byte ptr [r8+rdx*1+0x80]
prefetcht0 byte ptr [r9+rdx*1+0x80]
prefetcht0 byte ptr [r10+rdx*1+0x80]
- prefetcht0 byte ptr [r11+rdx*1+0x80]
- prefetcht0 byte ptr [r12+rdx*1+0x80]
- prefetcht0 byte ptr [r13+rdx*1+0x80]
mov rax, qword ptr [rdi+0x20]
mov rbx, qword ptr [rdi+0x28]
mov r8, qword ptr [rdi+0x30]
mov r9, qword ptr [rdi+0x38]
- mov r10, qword ptr [rdi+0x60]
- mov r11, qword ptr [rdi+0x68]
- mov r12, qword ptr [rdi+0x70]
- mov r13, qword ptr [rdi+0x78]
vmovdqu32 ymm11, ymmword ptr [rax+rdx*1-0x20]
+ cmp rsi, 0x0D
+ jb 4f
+ mov r10, qword ptr [rdi+0x60]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r10+rdx*1+0x80]
+4:
vmovdqu32 ymm13, ymmword ptr [rbx+rdx*1-0x20]
+ cmp rsi, 0x0E
+ jb 4f
+ mov r11, qword ptr [rdi+0x68]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r11+rdx*1+0x80]
+4:
vpunpckldq zmm24, zmm11, zmm13
vpunpckhdq zmm25, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [r8+rdx*1-0x20]
+ cmp rsi, 0x0F
+ jb 4f
+ mov r12, qword ptr [rdi+0x70]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r12+rdx*1+0x80]
+4:
vmovdqu32 ymm13, ymmword ptr [r9+rdx*1-0x20]
+ cmp rsi, 0x10
+ jb 4f
+ mov r13, qword ptr [rdi+0x78]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rdx*1-0x20], 0x01
+ prefetcht0 byte ptr [r13+rdx*1+0x80]
+4:
vpunpckldq zmm26, zmm11, zmm13
vpunpckhdq zmm27, zmm11, zmm13
- prefetcht0 byte ptr [rax+rdx*1+0x80]
- prefetcht0 byte ptr [rbx+rdx*1+0x80]
- prefetcht0 byte ptr [r8+rdx*1+0x80]
- prefetcht0 byte ptr [r9+rdx*1+0x80]
prefetcht0 byte ptr [r10+rdx*1+0x80]
prefetcht0 byte ptr [r11+rdx*1+0x80]
prefetcht0 byte ptr [r12+rdx*1+0x80]
@@ -372,6 +414,7 @@ blake3_hash_many_avx512:
vpxord zmm6, zmm6, zmm30
vpxord zmm7, zmm7, zmm31
movzx eax, byte ptr [rbp+0x38]
+ cmp rdx, qword ptr [rsp+0x100]
jb 3b
mov rbx, qword ptr [rbp+0x50]
vpunpckldq zmm8, zmm0, zmm2
@@ -413,12 +456,26 @@ blake3_hash_many_avx512:
vextracti64x4 ymmword ptr [rbx+0xC0], zmm2, 0x00
vextracti64x4 ymmword ptr [rbx+0xE0], zmm3, 0x00
vextracti64x4 ymmword ptr [rbx+0x100], zmm8, 0x01
+ cmp rsi, 0x0A
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x120], zmm10, 0x01
+ cmp rsi, 0x0B
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x140], zmm12, 0x01
+ cmp rsi, 0x0C
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x160], zmm14, 0x01
+ cmp rsi, 0x0D
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x180], zmm0, 0x01
+ cmp rsi, 0x0E
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x1A0], zmm1, 0x01
+ cmp rsi, 0x0F
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x1C0], zmm2, 0x01
+ cmp rsi, 0x10
+ jb 9f
vextracti64x4 ymmword ptr [rbx+0x1E0], zmm3, 0x01
vmovdqa32 zmm8, zmmword ptr [rsp]
vmovdqa32 zmm9, zmmword ptr [rsp+0x40]
@@ -432,8 +489,8 @@ blake3_hash_many_avx512:
mov qword ptr [rbp+0x50], rbx
add rdi, 0x80
sub rsi, 0x10
- cmp rsi, 0x10
- jnb 2b
+ cmp rsi, 0x08
+ jnbe 2b
test esi, esi
jnz 5f
9:
@@ -448,9 +505,8 @@ blake3_hash_many_avx512:
ret
.p2align 6
5:
- mov rax, rsp
- test sil, 0x08
- jz 3f
+ cmp sil, 0x04
+ jbe 3f
vpbroadcastd ymm0, dword ptr [rcx]
vpbroadcastd ymm1, dword ptr [rcx+0x04]
vpbroadcastd ymm2, dword ptr [rcx+0x08]
@@ -459,45 +515,50 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
- movzx edx, byte ptr [rbp+0x38]
- movzx ebx, byte ptr [rbp+0x40]
- or edx, ebx
- xor ebx, ebx
+ movzx eax, byte ptr [rbp+0x38]
+ movzx edx, byte ptr [rbp+0x40]
+ or eax, edx
+ xor edx, edx
2:
- movzx r8d, byte ptr [rbp+0x48]
- or r8d, edx
- add rbx, 0x40
- cmp rbx, qword ptr [rsp+0x100]
- cmovz edx, r8d
- mov dword ptr [rsp+0x80], edx
- mov edx, 0xCC
- kmovw k2, edx
- mov edx, 0x33
- kmovw k3, edx
- mov rdx, qword ptr [rdi]
- mov r8, qword ptr [rdi+0x20]
- vmovups xmm8, xmmword ptr [rdx+rbx*1-0x40]
- vinserti32x4 ymm8, ymm8, xmmword ptr [r8+rbx*1-0x40], 0x01
- vmovups xmm12, xmmword ptr [rdx+rbx*1-0x30]
- vinserti32x4 ymm12, ymm12, xmmword ptr [r8+rbx*1-0x30], 0x01
- mov rdx, qword ptr [rdi+0x08]
- mov r8, qword ptr [rdi+0x28]
- vmovups xmm9, xmmword ptr [rdx+rbx*1-0x40]
- vinserti32x4 ymm9, ymm9, xmmword ptr [r8+rbx*1-0x40], 0x01
- vmovups xmm13, xmmword ptr [rdx+rbx*1-0x30]
- vinserti32x4 ymm13, ymm13, xmmword ptr [r8+rbx*1-0x30], 0x01
- mov rdx, qword ptr [rdi+0x10]
- mov r8, qword ptr [rdi+0x30]
- vmovups xmm10, xmmword ptr [rdx+rbx*1-0x40]
- vinserti32x4 ymm10, ymm10, xmmword ptr [r8+rbx*1-0x40], 0x01
- vmovups xmm14, xmmword ptr [rdx+rbx*1-0x30]
- vinserti32x4 ymm14, ymm14, xmmword ptr [r8+rbx*1-0x30], 0x01
- mov rdx, qword ptr [rdi+0x18]
- mov r8, qword ptr [rdi+0x38]
- vmovups xmm11, xmmword ptr [rdx+rbx*1-0x40]
- vinserti32x4 ymm11, ymm11, xmmword ptr [r8+rbx*1-0x40], 0x01
- vmovups xmm15, xmmword ptr [rdx+rbx*1-0x30]
- vinserti32x4 ymm15, ymm15, xmmword ptr [r8+rbx*1-0x30], 0x01
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 0x40
+ cmp rdx, qword ptr [rsp+0x100]
+ cmovz eax, ebx
+ mov dword ptr [rsp+0x80], eax
+ mov rax, qword ptr [rdi]
+ mov rbx, qword ptr [rdi+0x20]
+ vmovups xmm8, xmmword ptr [rax+rdx*1-0x40]
+ vinserti32x4 ymm8, ymm8, xmmword ptr [rbx+rdx*1-0x40], 0x01
+ vmovups xmm12, xmmword ptr [rax+rdx*1-0x30]
+ vinserti32x4 ymm12, ymm12, xmmword ptr [rbx+rdx*1-0x30], 0x01
+ mov rax, qword ptr [rdi+0x08]
+ vmovups xmm9, xmmword ptr [rax+rdx*1-0x40]
+ vmovups xmm13, xmmword ptr [rax+rdx*1-0x30]
+ cmp sil, 0x06
+ jb 4f
+ mov rbx, qword ptr [rdi+0x28]
+ vinserti32x4 ymm9, ymm9, xmmword ptr [rbx+rdx*1-0x40], 0x01
+ vinserti32x4 ymm13, ymm13, xmmword ptr [rbx+rdx*1-0x30], 0x01
+4:
+ mov rax, qword ptr [rdi+0x10]
+ vmovups xmm10, xmmword ptr [rax+rdx*1-0x40]
+ vmovups xmm14, xmmword ptr [rax+rdx*1-0x30]
+ cmp sil, 0x07
+ jb 4f
+ mov rbx, qword ptr [rdi+0x30]
+ vinserti32x4 ymm10, ymm10, xmmword ptr [rbx+rdx*1-0x40], 0x01
+ vinserti32x4 ymm14, ymm14, xmmword ptr [rbx+rdx*1-0x30], 0x01
+4:
+ mov rax, qword ptr [rdi+0x18]
+ vmovups xmm11, xmmword ptr [rax+rdx*1-0x40]
+ vmovups xmm15, xmmword ptr [rax+rdx*1-0x30]
+ cmp sil, 0x08
+ jb 4f
+ mov rbx, qword ptr [rdi+0x38]
+ vinserti32x4 ymm11, ymm11, xmmword ptr [rbx+rdx*1-0x40], 0x01
+ vinserti32x4 ymm15, ymm15, xmmword ptr [rbx+rdx*1-0x30], 0x01
+4:
vpunpckldq ymm24, ymm8, ymm9
vpunpckhdq ymm9, ymm8, ymm9
vpunpckldq ymm8, ymm10, ymm11
@@ -514,30 +575,39 @@ blake3_hash_many_avx512:
vshufps ymm12, ymm10, ymm12, 0xEE
vshufps ymm10, ymm13, ymm15, 0x44
vshufps ymm15, ymm13, ymm15, 0xEE
- mov rdx, qword ptr [rdi]
- mov r8, qword ptr [rdi+0x20]
- vmovups xmm16, xmmword ptr [rdx+rbx*1-0x20]
- vinserti32x4 ymm16, ymm16, xmmword ptr [r8+rbx*1-0x20], 0x01
- vmovups xmm20, xmmword ptr [rdx+rbx*1-0x10]
- vinserti32x4 ymm20, ymm20, xmmword ptr [r8+rbx*1-0x10], 0x01
- mov rdx, qword ptr [rdi+0x08]
- mov r8, qword ptr [rdi+0x28]
- vmovups xmm17, xmmword ptr [rdx+rbx*1-0x20]
- vinserti32x4 ymm17, ymm17, xmmword ptr [r8+rbx*1-0x20], 0x01
- vmovups xmm21, xmmword ptr [rdx+rbx*1-0x10]
- vinserti32x4 ymm21, ymm21, xmmword ptr [r8+rbx*1-0x10], 0x01
- mov rdx, qword ptr [rdi+0x10]
- mov r8, qword ptr [rdi+0x30]
- vmovups xmm18, xmmword ptr [rdx+rbx*1-0x20]
- vinserti32x4 ymm18, ymm18, xmmword ptr [r8+rbx*1-0x20], 0x01
- vmovups xmm22, xmmword ptr [rdx+rbx*1-0x10]
- vinserti32x4 ymm22, ymm22, xmmword ptr [r8+rbx*1-0x10], 0x01
- mov rdx, qword ptr [rdi+0x18]
- mov r8, qword ptr [rdi+0x38]
- vmovups xmm19, xmmword ptr [rdx+rbx*1-0x20]
- vinserti32x4 ymm19, ymm19, xmmword ptr [r8+rbx*1-0x20], 0x01
- vmovups xmm23, xmmword ptr [rdx+rbx*1-0x10]
- vinserti32x4 ymm23, ymm23, xmmword ptr [r8+rbx*1-0x10], 0x01
+ mov rax, qword ptr [rdi]
+ mov rbx, qword ptr [rdi+0x20]
+ vmovups xmm16, xmmword ptr [rax+rdx*1-0x20]
+ vinserti32x4 ymm16, ymm16, xmmword ptr [rbx+rdx*1-0x20], 0x01
+ vmovups xmm20, xmmword ptr [rax+rdx*1-0x10]
+ vinserti32x4 ymm20, ymm20, xmmword ptr [rbx+rdx*1-0x10], 0x01
+ mov rax, qword ptr [rdi+0x08]
+ vmovups xmm17, xmmword ptr [rax+rdx*1-0x20]
+ vmovups xmm21, xmmword ptr [rax+rdx*1-0x10]
+ cmp sil, 0x06
+ jb 4f
+ mov rbx, qword ptr [rdi+0x28]
+ vinserti32x4 ymm17, ymm17, xmmword ptr [rbx+rdx*1-0x20], 0x01
+ vinserti32x4 ymm21, ymm21, xmmword ptr [rbx+rdx*1-0x10], 0x01
+4:
+ mov rax, qword ptr [rdi+0x10]
+ vmovups xmm18, xmmword ptr [rax+rdx*1-0x20]
+ vmovups xmm22, xmmword ptr [rax+rdx*1-0x10]
+ cmp sil, 0x07
+ jb 4f
+ mov rbx, qword ptr [rdi+0x30]
+ vinserti32x4 ymm18, ymm18, xmmword ptr [rbx+rdx*1-0x20], 0x01
+ vinserti32x4 ymm22, ymm22, xmmword ptr [rbx+rdx*1-0x10], 0x01
+4:
+ mov rax, qword ptr [rdi+0x18]
+ vmovups xmm19, xmmword ptr [rax+rdx*1-0x20]
+ vmovups xmm23, xmmword ptr [rax+rdx*1-0x10]
+ cmp sil, 0x08
+ jb 4f
+ mov rbx, qword ptr [rdi+0x38]
+ vinserti32x4 ymm19, ymm19, xmmword ptr [rbx+rdx*1-0x20], 0x01
+ vinserti32x4 ymm23, ymm23, xmmword ptr [rbx+rdx*1-0x10], 0x01
+4:
vpunpckldq ymm13, ymm16, ymm17
vpunpckhdq ymm17, ymm16, ymm17
vpunpckldq ymm16, ymm18, ymm19
@@ -558,11 +628,11 @@ blake3_hash_many_avx512:
vpbroadcastd ymm25, dword ptr [rip+BLAKE3_IV_1]
vpbroadcastd ymm26, dword ptr [rip+BLAKE3_IV_2]
vpbroadcastd ymm27, dword ptr [rip+BLAKE3_IV_3]
- vmovdqa32 ymm28, ymmword ptr [rax]
- vmovdqa32 ymm29, ymmword ptr [rax+0x40]
+ vmovdqa32 ymm28, ymmword ptr [rsp]
+ vmovdqa32 ymm29, ymmword ptr [rsp+0x40]
vpbroadcastd ymm30, dword ptr [rip+BLAKE3_BLOCK_LEN]
vpbroadcastd ymm31, dword ptr [rsp+0x80]
- mov dl, 0x07
+ mov al, 0x07
4:
vpaddd ymm0, ymm0, ymm14
vpaddd ymm1, ymm1, ymm24
@@ -694,7 +764,7 @@ blake3_hash_many_avx512:
vprord ymm7, ymm7, 0x07
vprord ymm4, ymm4, 0x07
vmovdqa32 ymm8, ymmword ptr [rsp+0xC0]
- dec dl
+ dec al
jnz 4b
vpxord ymm0, ymm0, ymm21
vpxord ymm1, ymm1, ymm25
@@ -704,79 +774,85 @@ blake3_hash_many_avx512:
vpxord ymm5, ymm5, ymm29
vpxord ymm6, ymm6, ymm30
vpxord ymm7, ymm7, ymm31
- movzx edx, byte ptr [rbp+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ cmp rdx, qword ptr [rsp+0x100]
jb 2b
- mov r8, qword ptr [rbp+0x50]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
+ mov rbx, qword ptr [rbp+0x50]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckldq ymm9, ymm2, ymm3
+ vpunpckhdq ymm10, ymm0, ymm1
+ vpunpckldq ymm11, ymm4, ymm5
+ vpunpckldq ymm0, ymm6, ymm7
vshufps ymm12, ymm8, ymm9, 0x4E
- vblendps ymm1, ymm8, ymm12, 0xCC
+ vpblendd ymm1, ymm8, ymm12, 0xCC
vshufps ymm8, ymm11, ymm0, 0x4E
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [r8], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [r8+0x20], ymm7
+ vpunpckhdq ymm13, ymm2, ymm3
+ vpblendd ymm2, ymm11, ymm8, 0xCC
+ vpblendd ymm3, ymm12, ymm9, 0xCC
+ vperm2i128 ymm12, ymm1, ymm2, 0x20
+ vmovdqu ymmword ptr [rbx], ymm12
+ vpunpckhdq ymm14, ymm4, ymm5
+ vpblendd ymm4, ymm8, ymm0, 0xCC
+ vpunpckhdq ymm15, ymm6, ymm7
+ vperm2i128 ymm7, ymm3, ymm4, 0x20
+ vmovdqu ymmword ptr [rbx+0x20], ymm7
vshufps ymm5, ymm10, ymm13, 0x4E
- vblendps ymm6, ymm5, ymm13, 0xCC
+ vpblendd ymm6, ymm5, ymm13, 0xCC
vshufps ymm13, ymm14, ymm15, 0x4E
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [r8+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [r8+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [r8+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [r8+0xA0], ymm11
- vmovups ymmword ptr [r8+0xC0], ymm14
- vmovups ymmword ptr [r8+0xE0], ymm15
- lea r9, qword ptr [rax+0x20]
- kortestw k1, k1
- cmovnz rax, r9
- add r8, 0x100
- mov qword ptr [rbp+0x50], r8
- add rdi, 0x40
+ vpblendd ymm10, ymm10, ymm5, 0xCC
+ vpblendd ymm14, ymm14, ymm13, 0xCC
+ vperm2i128 ymm8, ymm10, ymm14, 0x20
+ vmovdqu ymmword ptr [rbx+0x40], ymm8
+ vpblendd ymm15, ymm13, ymm15, 0xCC
+ vperm2i128 ymm13, ymm6, ymm15, 0x20
+ vmovdqu ymmword ptr [rbx+0x60], ymm13
+ vperm2i128 ymm9, ymm1, ymm2, 0x31
+ vmovdqu ymmword ptr [rbx+0x80], ymm9
+ cmp sil, 0x06
+ jb 4f
+ vperm2i128 ymm11, ymm3, ymm4, 0x31
+ vmovdqu ymmword ptr [rbx+0xA0], ymm11
+ cmp sil, 0x07
+ jb 4f
+ vperm2i128 ymm14, ymm10, ymm14, 0x31
+ vmovdqu ymmword ptr [rbx+0xC0], ymm14
+ cmp sil, 0x08
+ jb 4f
+ vperm2i128 ymm15, ymm6, ymm15, 0x31
+ vmovdqu ymmword ptr [rbx+0xE0], ymm15
+4:
+ jmp 9b
3:
+ mov rax, qword ptr [rsp+0x100]
mov rdx, qword ptr [rbp+0x50]
movzx ebx, byte ptr [rbp+0x38]
movzx r8d, byte ptr [rbp+0x48]
- test sil, 0x04
- jz 3f
+ mov r9d, 0xAAAA
+ kmovw k2, r9d
+ mov r9d, 0x8888
+ kmovw k3, r9d
+ cmp sil, 0x02
+ jbe 3f
vbroadcasti32x4 zmm0, xmmword ptr [rcx]
vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x10]
vbroadcasti32x4 zmm4, xmmword ptr [rip+BLAKE3_IV]
mov r9d, 0x4444
- kmovw k2, r9d
- vmovdqa xmm6, xmmword ptr [rax]
- vmovdqa xmm7, xmmword ptr [rax+0x40]
+ kmovw k4, r9d
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+0x40]
+ vpbroadcastd zmm5, dword ptr [rip+BLAKE3_BLOCK_LEN]
vpunpckldq xmm8, xmm6, xmm7
- vpunpckhdq xmm9, xmm6, xmm7
- vpermq ymm8, ymm8, 0xDC
- vpermq ymm9, ymm9, 0xDC
- vpbroadcastd zmm6, dword ptr [rip+BLAKE3_BLOCK_LEN]
- vinserti64x4 zmm5, zmm8, ymm9, 0x01
- vpblendmd zmm5 {k2}, zmm5, zmm6
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti64x4 zmm8, zmm8, ymm7, 0x01
+ vpermq zmm8, zmm8, 0xDC
+ vpblendmd zmm5 {k4}, zmm8, zmm5
mov r9, qword ptr [rdi]
mov r10, qword ptr [rdi+0x08]
mov r11, qword ptr [rdi+0x10]
+ cmp sil, 0x04
+ jb 4f
mov r12, qword ptr [rdi+0x18]
- mov r13d, 0xAAAA
- kmovw k2, r13d
- mov r13d, 0x8888
- kmovw k3, r13d
+4:
movzx r13d, byte ptr [rbp+0x40]
or r13d, ebx
xor r14d, r14d
@@ -784,32 +860,34 @@ blake3_hash_many_avx512:
movzx r15d, byte ptr [rbp+0x48]
or r15d, r13d
add r14, 0x40
- cmp r14, qword ptr [rsp+0x100]
+ cmp r14, rax
cmovz r13d, r15d
mov dword ptr [rsp+0x80], r13d
vmovdqa32 zmm2, zmm4
- vpbroadcastd zmm6, dword ptr [rsp+0x80]
- vpblendmd zmm3 {k3}, zmm5, zmm6
+ vpblendmd zmm3 {k3}, zmm5, dword ptr [rsp+0x80] {1to16}
vmovdqu32 zmm10, zmmword ptr [r9+r14*1-0x40]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03
vmovdqu32 zmm11, zmmword ptr [r9+r14*1-0x30]
+ vmovdqu32 zmm12, zmmword ptr [r9+r14*1-0x20]
+ vmovdqu32 zmm13, zmmword ptr [r9+r14*1-0x10]
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01
vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x30], 0x01
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r10+r14*1-0x20], 0x01
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r10+r14*1-0x10], 0x01
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02
vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x30], 0x02
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r11+r14*1-0x20], 0x02
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r11+r14*1-0x10], 0x02
+ cmp sil, 0x04
+ jb 4f
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03
vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x30], 0x03
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r12+r14*1-0x20], 0x03
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r12+r14*1-0x10], 0x03
+4:
vshufps zmm6, zmm10, zmm11, 0x88
vshufps zmm7, zmm10, zmm11, 0xDD
- vmovdqu32 zmm10, zmmword ptr [r9+r14*1-0x20]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x20], 0x01
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x20], 0x02
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x20], 0x03
- vmovdqu32 zmm11, zmmword ptr [r9+r14*1-0x10]
- vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x10], 0x01
- vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x10], 0x02
- vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x10], 0x03
- vshufps zmm8, zmm10, zmm11, 0x88
- vshufps zmm9, zmm10, zmm11, 0xDD
+ vshufps zmm8, zmm12, zmm13, 0x88
+ vshufps zmm9, zmm12, zmm13, 0xDD
vpshufd zmm8, zmm8, 0x93
vpshufd zmm9, zmm9, 0x93
mov r15b, 0x07
@@ -850,24 +928,25 @@ blake3_hash_many_avx512:
vpshufd zmm2, zmm2, 0x93
dec r15b
jz 4f
- vshufps zmm12, zmm6, zmm7, 0xD6
- vpshufd zmm13, zmm6, 0x0F
- vpshufd zmm6, zmm12, 0x39
- vshufps zmm12, zmm8, zmm9, 0xFA
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vpunpcklqdq zmm12, zmm9, zmm7
- vpblendmd zmm12 {k3}, zmm12, zmm8
- vpshufd zmm12, zmm12, 0x78
+ vshufps zmm14, zmm6, zmm7, 0xD6
+ vpshufd zmm15, zmm6, 0x0F
+ vpshufd zmm6, zmm14, 0x39
+ vshufps zmm14, zmm8, zmm9, 0xFA
+ vpblendmd zmm15 {k2}, zmm15, zmm14
+ vpunpcklqdq zmm14, zmm9, zmm7
+ vpblendmd zmm14 {k3}, zmm14, zmm8
+ vpshufd zmm14, zmm14, 0x78
vpunpckhdq zmm7, zmm7, zmm9
vpunpckldq zmm8, zmm8, zmm7
vpshufd zmm9, zmm8, 0x1E
- vmovdqa32 zmm7, zmm13
- vmovdqa32 zmm8, zmm12
+ vmovdqa32 zmm7, zmm15
+ vmovdqa32 zmm8, zmm14
jmp 4b
4:
vpxord zmm0, zmm0, zmm2
vpxord zmm1, zmm1, zmm3
mov r13d, ebx
+ cmp r14, rax
jb 2b
vmovdqu xmmword ptr [rdx], xmm0
vmovdqu xmmword ptr [rdx+0x10], xmm1
@@ -875,28 +954,33 @@ blake3_hash_many_avx512:
vextracti128 xmmword ptr [rdx+0x30], ymm1, 0x01
vextracti32x4 xmmword ptr [rdx+0x40], zmm0, 0x02
vextracti32x4 xmmword ptr [rdx+0x50], zmm1, 0x02
+ cmp sil, 0x04
+ jb 4f
vextracti32x4 xmmword ptr [rdx+0x60], zmm0, 0x03
vextracti32x4 xmmword ptr [rdx+0x70], zmm1, 0x03
- lea r15, qword ptr [rax+0x10]
- kortestw k1, k1
- cmovnz rax, r15
- add rdx, 0x80
- add rdi, 0x20
+4:
+ jmp 9b
3:
- test sil, 0x02
- jz 3f
+ test sil, sil
+ jz 9b
vbroadcasti128 ymm0, xmmword ptr [rcx]
vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
vbroadcasti128 ymm4, xmmword ptr [rip+BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01
- vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- vmovd xmm6, dword ptr [rax+0x04]
- vpinsrd xmm6, xmm6, dword ptr [rax+0x44], 0x01
- vpinsrd xmm6, xmm6, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- vinserti128 ymm5, ymm5, xmm6, 0x01
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+0x40]
+ mov r9d, 0x40
+ vpbroadcastq ymm5, r9
+ mov r9d, 0x55
+ kmovw k4, r9d
+ vpunpckldq xmm8, xmm6, xmm7
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti128 ymm8, ymm8, xmm7, 0x01
+ vpermq ymm5 {k4}, ymm8, 0xDC
mov r9, qword ptr [rdi]
+ cmp sil, 0x02
+ jb 4f
mov r10, qword ptr [rdi+0x08]
+4:
mov r11d, ebx
movzx r12d, byte ptr [rbp+0x40]
or r11d, r12d
@@ -905,24 +989,26 @@ blake3_hash_many_avx512:
movzx r13d, byte ptr [rbp+0x48]
or r13d, r11d
add r12, 0x40
- cmp r12, qword ptr [rsp+0x100]
+ cmp r12, rax
cmovz r11d, r13d
mov dword ptr [rsp+0x80], r11d
vmovdqa ymm2, ymm4
- vpbroadcastd ymm6, dword ptr [rsp+0x80]
- vpblendd ymm3, ymm5, ymm6, 0x88
+ vpblendmd ymm3 {k3}, ymm5, dword ptr [rsp+0x80] {1to8}
vmovdqu ymm10, ymmword ptr [r9+r12*1-0x40]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01
vmovdqu ymm11, ymmword ptr [r9+r12*1-0x30]
+ vmovdqu ymm12, ymmword ptr [r9+r12*1-0x20]
+ vmovdqu ymm13, ymmword ptr [r9+r12*1-0x10]
+ cmp sil, 0x02
+ jb 4f
+ vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01
vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x30], 0x01
+ vinserti128 ymm12, ymm12, xmmword ptr [r10+r12*1-0x20], 0x01
+ vinserti128 ymm13, ymm13, xmmword ptr [r10+r12*1-0x10], 0x01
+4:
vshufps ymm6, ymm10, ymm11, 0x88
vshufps ymm7, ymm10, ymm11, 0xDD
- vmovdqu ymm10, ymmword ptr [r9+r12*1-0x20]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x20], 0x01
- vmovdqu ymm11, ymmword ptr [r9+r12*1-0x10]
- vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x10], 0x01
- vshufps ymm8, ymm10, ymm11, 0x88
- vshufps ymm9, ymm10, ymm11, 0xDD
+ vshufps ymm8, ymm12, ymm13, 0x88
+ vshufps ymm9, ymm12, ymm13, 0xDD
vpshufd ymm8, ymm8, 0x93
vpshufd ymm9, ymm9, 0x93
mov r13b, 0x07
@@ -981,107 +1067,15 @@ blake3_hash_many_avx512:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
mov r11d, ebx
+ cmp r12, rax
jb 2b
vmovdqu xmmword ptr [rdx], xmm0
vmovdqu xmmword ptr [rdx+0x10], xmm1
+ cmp sil, 0x02
+ jb 4f
vextracti128 xmmword ptr [rdx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rdx+0x30], ymm1, 0x01
- lea r13, qword ptr [rax+0x08]
- kortestw k1, k1
- cmovnz rax, r13
- add rdx, 0x40
- add rdi, 0x10
-3:
- test sil, 0x01
- jz 9b
- vmovdqu xmm0, xmmword ptr [rcx]
- vmovdqu xmm1, xmmword ptr [rcx+0x10]
- vmovdqa xmm4, xmmword ptr [rip+BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01
- vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- mov r9, qword ptr [rdi]
- mov r10d, ebx
- movzx r11d, byte ptr [rbp+0x40]
- or r10d, r11d
- xor r11d, r11d
-2:
- movzx r12d, byte ptr [rbp+0x48]
- or r12d, r10d
- add r11, 0x40
- cmp r11, qword ptr [rsp+0x100]
- cmovz r10d, r12d
- vmovdqa xmm2, xmm4
- vpinsrd xmm3, xmm5, r10d, 0x03
- vmovdqu xmm10, xmmword ptr [r9+r11*1-0x40]
- vmovdqu xmm11, xmmword ptr [r9+r11*1-0x30]
- vshufps xmm6, xmm10, xmm11, 0x88
- vshufps xmm7, xmm10, xmm11, 0xDD
- vmovdqu xmm10, xmmword ptr [r9+r11*1-0x20]
- vmovdqu xmm11, xmmword ptr [r9+r11*1-0x10]
- vshufps xmm8, xmm10, xmm11, 0x88
- vshufps xmm9, xmm10, xmm11, 0xDD
- vpshufd xmm8, xmm8, 0x93
- vpshufd xmm9, xmm9, 0x93
- mov r12b, 0x07
-4:
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x10
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x0C
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x08
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x07
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm8
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x10
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x0C
- vpaddd xmm0, xmm0, xmm9
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x08
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x07
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec r12b
- jz 4f
- vshufps xmm10, xmm6, xmm7, 0xD6
- vpshufd xmm11, xmm6, 0x0F
- vpshufd xmm6, xmm10, 0x39
- vshufps xmm10, xmm8, xmm9, 0xFA
- vpblendd xmm11, xmm11, xmm10, 0xAA
- vpunpcklqdq xmm10, xmm9, xmm7
- vpblendd xmm10, xmm10, xmm8, 0x88
- vpshufd xmm10, xmm10, 0x78
- vpunpckhdq xmm7, xmm7, xmm9
- vpunpckldq xmm8, xmm8, xmm7
- vpshufd xmm9, xmm8, 0x1E
- vmovdqa xmm7, xmm11
- vmovdqa xmm8, xmm10
- jmp 4b
4:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov r10d, ebx
- jb 2b
- vmovdqu xmmword ptr [rdx], xmm0
- vmovdqu xmmword ptr [rdx+0x10], xmm1
jmp 9b
.p2align 6
@@ -1658,8 +1652,8 @@ _blake3_xof_many_avx512:
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x40], zmm1
add r9, 0x400
- cmp rax, 0x18
- lea rax, qword ptr [rax-0x10]
+ sub rax, 0x10
+ cmp rax, 0x08
jnbe 3b
test al, al
jnz 2f
diff --git a/c/blake3_avx512_x86-64_windows_gnu.S b/c/blake3_avx512_x86-64_windows_gnu.S
index 089ba83..419bbda 100644
--- a/c/blake3_avx512_x86-64_windows_gnu.S
+++ b/c/blake3_avx512_x86-64_windows_gnu.S
@@ -23,16 +23,16 @@ blake3_hash_many_avx512:
push r15
mov rbp, rsp
sub rsp, 0x1E8
- movdqa xmmword ptr [rbp-0xA8], xmm6
- movdqa xmmword ptr [rbp-0x98], xmm7
- movdqa xmmword ptr [rbp-0x88], xmm8
- movdqa xmmword ptr [rbp-0x78], xmm9
- movdqa xmmword ptr [rbp-0x68], xmm10
- movdqa xmmword ptr [rbp-0x58], xmm11
- movdqa xmmword ptr [rbp-0x48], xmm12
- movdqa xmmword ptr [rbp-0x38], xmm13
- movdqa xmmword ptr [rbp-0x28], xmm14
- movdqa xmmword ptr [rbp-0x18], xmm15
+ movaps xmmword ptr [rbp-0xA8], xmm6
+ movaps xmmword ptr [rbp-0x98], xmm7
+ movaps xmmword ptr [rbp-0x88], xmm8
+ movaps xmmword ptr [rbp-0x78], xmm9
+ movaps xmmword ptr [rbp-0x68], xmm10
+ movaps xmmword ptr [rbp-0x58], xmm11
+ movaps xmmword ptr [rbp-0x48], xmm12
+ movaps xmmword ptr [rbp-0x38], xmm13
+ movaps xmmword ptr [rbp-0x28], xmm14
+ movaps xmmword ptr [rbp-0x18], xmm15
and rsp, 0xFFFFFFFFFFFFFFC0
mov rax, qword ptr [rbp+0x68]
movzx ebx, byte ptr [rbp+0x70]
@@ -41,7 +41,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm0, eax
shr rax, 0x20
vpbroadcastd ymm1, eax
- vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0]
+ vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0+0]
vmovdqa32 ymm3 {k1} {z}, ymmword ptr [rip+ADD0+32]
vpaddd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp], ymm2
@@ -56,8 +56,8 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x60], ymm1
shl r8, 0x06
mov qword ptr [rsp+0x100], r8
- cmp rdx, 0x10
- jb 5f
+ cmp rdx, 0x08
+ jbe 5f
.p2align 5
2:
vpbroadcastd zmm0, dword ptr [r9]
@@ -84,43 +84,64 @@ blake3_hash_many_avx512:
mov rdi, qword ptr [rcx+0x10]
mov r8, qword ptr [rcx+0x18]
mov r10, qword ptr [rcx+0x40]
- mov r11, qword ptr [rcx+0x48]
- mov r12, qword ptr [rcx+0x50]
- mov r13, qword ptr [rcx+0x58]
vmovdqu32 ymm8, ymmword ptr [rax+rbx*1-0x40]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rbx*1-0x40], 0x01
vmovdqu32 ymm9, ymmword ptr [rsi+rbx*1-0x40]
+ cmp rdx, 0x0A
+ jb 4f
+ mov r11, qword ptr [rcx+0x48]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rbx*1-0x40], 0x01
+4:
vpunpckldq zmm10, zmm8, zmm9
vpunpckhdq zmm11, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [rdi+rbx*1-0x40]
+ cmp rdx, 0x0B
+ jb 4f
+ mov r12, qword ptr [rcx+0x50]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rbx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [r8+rbx*1-0x40]
+ cmp rdx, 0x0C
+ jb 4f
+ mov r13, qword ptr [rcx+0x58]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rbx*1-0x40], 0x01
+4:
vpunpckldq zmm12, zmm8, zmm9
vpunpckhdq zmm13, zmm8, zmm9
mov rax, qword ptr [rcx+0x20]
mov rsi, qword ptr [rcx+0x28]
mov rdi, qword ptr [rcx+0x30]
mov r8, qword ptr [rcx+0x38]
- mov r10, qword ptr [rcx+0x60]
- mov r11, qword ptr [rcx+0x68]
- mov r12, qword ptr [rcx+0x70]
- mov r13, qword ptr [rcx+0x78]
vmovdqu32 ymm8, ymmword ptr [rax+rbx*1-0x40]
+ cmp rdx, 0x0D
+ jb 4f
+ mov r10, qword ptr [rcx+0x60]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rbx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [rsi+rbx*1-0x40]
+ cmp rdx, 0x0E
+ jb 4f
+ mov r11, qword ptr [rcx+0x68]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rbx*1-0x40], 0x01
+4:
vpunpckldq zmm14, zmm8, zmm9
vpunpckhdq zmm15, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [rdi+rbx*1-0x40]
+ cmp rdx, 0x0F
+ jb 4f
+ mov r12, qword ptr [rcx+0x70]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rbx*1-0x40], 0x01
+4:
vmovdqu32 ymm9, ymmword ptr [r8+rbx*1-0x40]
+ cmp rdx, 0x10
+ jb 4f
+ mov r13, qword ptr [rcx+0x78]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rbx*1-0x40], 0x01
+4:
vpunpckldq zmm16, zmm8, zmm9
vpunpckhdq zmm17, zmm8, zmm9
- vmovdqa32 zmm8, zmmword ptr [0x0000000000000AC0]
- vmovdqa32 zmm9, zmmword ptr [0x0000000000000B00]
+ vmovdqa32 zmm8, zmmword ptr [rip+INDEX0]
+ vmovdqa32 zmm9, zmmword ptr [rip+INDEX1]
vpunpcklqdq zmm18, zmm10, zmm12
vpunpcklqdq zmm20, zmm14, zmm16
vmovdqa32 zmm19, zmm18
@@ -146,19 +167,31 @@ blake3_hash_many_avx512:
mov rdi, qword ptr [rcx+0x10]
mov r8, qword ptr [rcx+0x18]
mov r10, qword ptr [rcx+0x40]
- mov r11, qword ptr [rcx+0x48]
- mov r12, qword ptr [rcx+0x50]
- mov r13, qword ptr [rcx+0x58]
vmovdqu32 ymm11, ymmword ptr [rax+rbx*1-0x20]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rbx*1-0x20], 0x01
vmovdqu32 ymm13, ymmword ptr [rsi+rbx*1-0x20]
+ cmp rdx, 0x0A
+ jb 4f
+ mov r11, qword ptr [rcx+0x48]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r11+rbx*1+0x80]
+4:
vpunpckldq zmm15, zmm11, zmm13
vpunpckhdq zmm17, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [rdi+rbx*1-0x20]
+ cmp rdx, 0x0B /* rdx < 11: skip the pointer slot at [rcx+0x50] entirely */
+ jb 4f
+ mov r12, qword ptr [rcx+0x50]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r12+rbx*1+0x80] /* prefetch via r12, the pointer just loaded; r13 is not loaded until the next group */
+4:
vmovdqu32 ymm13, ymmword ptr [r8+rbx*1-0x20]
+ cmp rdx, 0x0C
+ jb 4f
+ mov r13, qword ptr [rcx+0x58]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r13+rbx*1+0x80]
+4:
vpunpckldq zmm22, zmm11, zmm13
vpunpckhdq zmm23, zmm11, zmm13
prefetcht0 byte ptr [rax+rbx*1+0x80]
@@ -166,33 +199,42 @@ blake3_hash_many_avx512:
prefetcht0 byte ptr [rdi+rbx*1+0x80]
prefetcht0 byte ptr [r8+rbx*1+0x80]
prefetcht0 byte ptr [r10+rbx*1+0x80]
- prefetcht0 byte ptr [r11+rbx*1+0x80]
- prefetcht0 byte ptr [r12+rbx*1+0x80]
- prefetcht0 byte ptr [r13+rbx*1+0x80]
mov rax, qword ptr [rcx+0x20]
mov rsi, qword ptr [rcx+0x28]
mov rdi, qword ptr [rcx+0x30]
mov r8, qword ptr [rcx+0x38]
- mov r10, qword ptr [rcx+0x60]
- mov r11, qword ptr [rcx+0x68]
- mov r12, qword ptr [rcx+0x70]
- mov r13, qword ptr [rcx+0x78]
vmovdqu32 ymm11, ymmword ptr [rax+rbx*1-0x20]
+ cmp rdx, 0x0D
+ jb 4f
+ mov r10, qword ptr [rcx+0x60]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r10+rbx*1+0x80]
+4:
vmovdqu32 ymm13, ymmword ptr [rsi+rbx*1-0x20]
+ cmp rdx, 0x0E
+ jb 4f
+ mov r11, qword ptr [rcx+0x68]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r11+rbx*1+0x80]
+4:
vpunpckldq zmm24, zmm11, zmm13
vpunpckhdq zmm25, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [rdi+rbx*1-0x20]
+ cmp rdx, 0x0F
+ jb 4f
+ mov r12, qword ptr [rcx+0x70]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r12+rbx*1+0x80]
+4:
vmovdqu32 ymm13, ymmword ptr [r8+rbx*1-0x20]
+ cmp rdx, 0x10
+ jb 4f
+ mov r13, qword ptr [rcx+0x78]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rbx*1-0x20], 0x01
+ prefetcht0 byte ptr [r13+rbx*1+0x80]
+4:
vpunpckldq zmm26, zmm11, zmm13
vpunpckhdq zmm27, zmm11, zmm13
- prefetcht0 byte ptr [rax+rbx*1+0x80]
- prefetcht0 byte ptr [rsi+rbx*1+0x80]
- prefetcht0 byte ptr [rdi+rbx*1+0x80]
- prefetcht0 byte ptr [r8+rbx*1+0x80]
prefetcht0 byte ptr [r10+rbx*1+0x80]
prefetcht0 byte ptr [r11+rbx*1+0x80]
prefetcht0 byte ptr [r12+rbx*1+0x80]
@@ -216,13 +258,13 @@ blake3_hash_many_avx512:
vpunpckhqdq zmm26, zmm25, zmm27
vpermi2d zmm8, zmm24, zmm26
vpermi2d zmm9, zmm24, zmm26
- vpbroadcastd zmm17, dword ptr [0x0000000000000B80]
- vpbroadcastd zmm23, dword ptr [0x0000000000000B84]
- vpbroadcastd zmm24, dword ptr [0x0000000000000B88]
- vpbroadcastd zmm25, dword ptr [0x0000000000000B8C]
+ vpbroadcastd zmm17, dword ptr [rip+BLAKE3_IV_0]
+ vpbroadcastd zmm23, dword ptr [rip+BLAKE3_IV_1]
+ vpbroadcastd zmm24, dword ptr [rip+BLAKE3_IV_2]
+ vpbroadcastd zmm25, dword ptr [rip+BLAKE3_IV_3]
vmovdqa32 zmm26, zmmword ptr [rsp]
vmovdqa32 zmm27, zmmword ptr [rsp+0x40]
- vpbroadcastd zmm30, dword ptr [0x0000000000000B98]
+ vpbroadcastd zmm30, dword ptr [rip+BLAKE3_BLOCK_LEN]
vpbroadcastd zmm31, dword ptr [rsp+0x80]
mov al, 0x07
4:
@@ -367,6 +409,7 @@ blake3_hash_many_avx512:
vpxord zmm6, zmm6, zmm30
vpxord zmm7, zmm7, zmm31
movzx eax, byte ptr [rbp+0x78]
+ cmp rbx, qword ptr [rsp+0x100]
jb 3b
mov rsi, qword ptr [rbp+0x90]
vpunpckldq zmm8, zmm0, zmm2
@@ -408,12 +451,26 @@ blake3_hash_many_avx512:
vextracti64x4 ymmword ptr [rsi+0xC0], zmm2, 0x00
vextracti64x4 ymmword ptr [rsi+0xE0], zmm3, 0x00
vextracti64x4 ymmword ptr [rsi+0x100], zmm8, 0x01
+ cmp rdx, 0x0A
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x120], zmm10, 0x01
+ cmp rdx, 0x0B
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x140], zmm12, 0x01
+ cmp rdx, 0x0C
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x160], zmm14, 0x01
+ cmp rdx, 0x0D
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x180], zmm0, 0x01
+ cmp rdx, 0x0E
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x1A0], zmm1, 0x01
+ cmp rdx, 0x0F
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x1C0], zmm2, 0x01
+ cmp rdx, 0x10
+ jb 9f
vextracti64x4 ymmword ptr [rsi+0x1E0], zmm3, 0x01
vmovdqa32 zmm8, zmmword ptr [rsp]
vmovdqa32 zmm9, zmmword ptr [rsp+0x40]
@@ -427,22 +484,22 @@ blake3_hash_many_avx512:
mov qword ptr [rbp+0x90], rsi
add rcx, 0x80
sub rdx, 0x10
- cmp rdx, 0x10
- jnb 2b
- test rdx, rdx
+ cmp rdx, 0x08
+ jnbe 2b
+ test edx, edx
jnz 5f
9:
vzeroupper
- movdqa xmm6, xmmword ptr [rbp-0xA8]
- movdqa xmm7, xmmword ptr [rbp-0x98]
- movdqa xmm8, xmmword ptr [rbp-0x88]
- movdqa xmm9, xmmword ptr [rbp-0x78]
- movdqa xmm10, xmmword ptr [rbp-0x68]
- movdqa xmm11, xmmword ptr [rbp-0x58]
- movdqa xmm12, xmmword ptr [rbp-0x48]
- movdqa xmm13, xmmword ptr [rbp-0x38]
- movdqa xmm14, xmmword ptr [rbp-0x28]
- movdqa xmm15, xmmword ptr [rbp-0x18]
+ movaps xmm6, xmmword ptr [rbp-0xA8]
+ movaps xmm7, xmmword ptr [rbp-0x98]
+ movaps xmm8, xmmword ptr [rbp-0x88]
+ movaps xmm9, xmmword ptr [rbp-0x78]
+ movaps xmm10, xmmword ptr [rbp-0x68]
+ movaps xmm11, xmmword ptr [rbp-0x58]
+ movaps xmm12, xmmword ptr [rbp-0x48]
+ movaps xmm13, xmmword ptr [rbp-0x38]
+ movaps xmm14, xmmword ptr [rbp-0x28]
+ movaps xmm15, xmmword ptr [rbp-0x18]
mov rsp, rbp
pop r15
pop r14
@@ -455,9 +512,8 @@ blake3_hash_many_avx512:
ret
.p2align 6
5:
- mov rax, rsp
- test dl, 0x08
- jz 3f
+ cmp dl, 0x04
+ jbe 3f
vpbroadcastd ymm0, dword ptr [r9]
vpbroadcastd ymm1, dword ptr [r9+0x04]
vpbroadcastd ymm2, dword ptr [r9+0x08]
@@ -466,45 +522,50 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [r9+0x14]
vpbroadcastd ymm6, dword ptr [r9+0x18]
vpbroadcastd ymm7, dword ptr [r9+0x1C]
- movzx ebx, byte ptr [rbp+0x78]
- movzx esi, byte ptr [rbp+0x80]
- or ebx, esi
- xor esi, esi
+ movzx eax, byte ptr [rbp+0x78]
+ movzx ebx, byte ptr [rbp+0x80]
+ or eax, ebx
+ xor ebx, ebx
2:
- movzx edi, byte ptr [rbp+0x88]
- or edi, ebx
- add rsi, 0x40
- cmp rsi, qword ptr [rsp+0x100]
- cmovz ebx, edi
- mov dword ptr [rsp+0x80], ebx
- mov ebx, 0xCC
- kmovw k2, ebx
- mov ebx, 0x33
- kmovw k3, ebx
- mov rbx, qword ptr [rcx]
- mov rdi, qword ptr [rcx+0x20]
- vmovups xmm8, xmmword ptr [rbx+rsi*1-0x40]
- vinserti32x4 ymm8, ymm8, xmmword ptr [rdi+rsi*1-0x40], 0x01
- vmovups xmm12, xmmword ptr [rbx+rsi*1-0x30]
- vinserti32x4 ymm12, ymm12, xmmword ptr [rdi+rsi*1-0x30], 0x01
- mov rbx, qword ptr [rcx+0x08]
- mov rdi, qword ptr [rcx+0x28]
- vmovups xmm9, xmmword ptr [rbx+rsi*1-0x40]
- vinserti32x4 ymm9, ymm9, xmmword ptr [rdi+rsi*1-0x40], 0x01
- vmovups xmm13, xmmword ptr [rbx+rsi*1-0x30]
- vinserti32x4 ymm13, ymm13, xmmword ptr [rdi+rsi*1-0x30], 0x01
- mov rbx, qword ptr [rcx+0x10]
- mov rdi, qword ptr [rcx+0x30]
- vmovups xmm10, xmmword ptr [rbx+rsi*1-0x40]
- vinserti32x4 ymm10, ymm10, xmmword ptr [rdi+rsi*1-0x40], 0x01
- vmovups xmm14, xmmword ptr [rbx+rsi*1-0x30]
- vinserti32x4 ymm14, ymm14, xmmword ptr [rdi+rsi*1-0x30], 0x01
- mov rbx, qword ptr [rcx+0x18]
- mov rdi, qword ptr [rcx+0x38]
- vmovups xmm11, xmmword ptr [rbx+rsi*1-0x40]
- vinserti32x4 ymm11, ymm11, xmmword ptr [rdi+rsi*1-0x40], 0x01
- vmovups xmm15, xmmword ptr [rbx+rsi*1-0x30]
- vinserti32x4 ymm15, ymm15, xmmword ptr [rdi+rsi*1-0x30], 0x01
+ movzx esi, byte ptr [rbp+0x88]
+ or esi, eax
+ add rbx, 0x40
+ cmp rbx, qword ptr [rsp+0x100]
+ cmovz eax, esi
+ mov dword ptr [rsp+0x80], eax
+ mov rax, qword ptr [rcx]
+ mov rsi, qword ptr [rcx+0x20]
+ vmovups xmm8, xmmword ptr [rax+rbx*1-0x40]
+ vinserti32x4 ymm8, ymm8, xmmword ptr [rsi+rbx*1-0x40], 0x01
+ vmovups xmm12, xmmword ptr [rax+rbx*1-0x30]
+ vinserti32x4 ymm12, ymm12, xmmword ptr [rsi+rbx*1-0x30], 0x01
+ mov rax, qword ptr [rcx+0x08]
+ vmovups xmm9, xmmword ptr [rax+rbx*1-0x40]
+ vmovups xmm13, xmmword ptr [rax+rbx*1-0x30]
+ cmp dl, 0x06
+ jb 4f
+ mov rsi, qword ptr [rcx+0x28]
+ vinserti32x4 ymm9, ymm9, xmmword ptr [rsi+rbx*1-0x40], 0x01
+ vinserti32x4 ymm13, ymm13, xmmword ptr [rsi+rbx*1-0x30], 0x01
+4:
+ mov rax, qword ptr [rcx+0x10]
+ vmovups xmm10, xmmword ptr [rax+rbx*1-0x40]
+ vmovups xmm14, xmmword ptr [rax+rbx*1-0x30]
+ cmp dl, 0x07
+ jb 4f
+ mov rsi, qword ptr [rcx+0x30]
+ vinserti32x4 ymm10, ymm10, xmmword ptr [rsi+rbx*1-0x40], 0x01
+ vinserti32x4 ymm14, ymm14, xmmword ptr [rsi+rbx*1-0x30], 0x01
+4:
+ mov rax, qword ptr [rcx+0x18]
+ vmovups xmm11, xmmword ptr [rax+rbx*1-0x40]
+ vmovups xmm15, xmmword ptr [rax+rbx*1-0x30]
+ cmp dl, 0x08
+ jb 4f
+ mov rsi, qword ptr [rcx+0x38]
+ vinserti32x4 ymm11, ymm11, xmmword ptr [rsi+rbx*1-0x40], 0x01
+ vinserti32x4 ymm15, ymm15, xmmword ptr [rsi+rbx*1-0x30], 0x01
+4:
vpunpckldq ymm24, ymm8, ymm9
vpunpckhdq ymm9, ymm8, ymm9
vpunpckldq ymm8, ymm10, ymm11
@@ -521,30 +582,39 @@ blake3_hash_many_avx512:
vshufps ymm12, ymm10, ymm12, 0xEE
vshufps ymm10, ymm13, ymm15, 0x44
vshufps ymm15, ymm13, ymm15, 0xEE
- mov rbx, qword ptr [rcx]
- mov rdi, qword ptr [rcx+0x20]
- vmovups xmm16, xmmword ptr [rbx+rsi*1-0x20]
- vinserti32x4 ymm16, ymm16, xmmword ptr [rdi+rsi*1-0x20], 0x01
- vmovups xmm20, xmmword ptr [rbx+rsi*1-0x10]
- vinserti32x4 ymm20, ymm20, xmmword ptr [rdi+rsi*1-0x10], 0x01
- mov rbx, qword ptr [rcx+0x08]
- mov rdi, qword ptr [rcx+0x28]
- vmovups xmm17, xmmword ptr [rbx+rsi*1-0x20]
- vinserti32x4 ymm17, ymm17, xmmword ptr [rdi+rsi*1-0x20], 0x01
- vmovups xmm21, xmmword ptr [rbx+rsi*1-0x10]
- vinserti32x4 ymm21, ymm21, xmmword ptr [rdi+rsi*1-0x10], 0x01
- mov rbx, qword ptr [rcx+0x10]
- mov rdi, qword ptr [rcx+0x30]
- vmovups xmm18, xmmword ptr [rbx+rsi*1-0x20]
- vinserti32x4 ymm18, ymm18, xmmword ptr [rdi+rsi*1-0x20], 0x01
- vmovups xmm22, xmmword ptr [rbx+rsi*1-0x10]
- vinserti32x4 ymm22, ymm22, xmmword ptr [rdi+rsi*1-0x10], 0x01
- mov rbx, qword ptr [rcx+0x18]
- mov rdi, qword ptr [rcx+0x38]
- vmovups xmm19, xmmword ptr [rbx+rsi*1-0x20]
- vinserti32x4 ymm19, ymm19, xmmword ptr [rdi+rsi*1-0x20], 0x01
- vmovups xmm23, xmmword ptr [rbx+rsi*1-0x10]
- vinserti32x4 ymm23, ymm23, xmmword ptr [rdi+rsi*1-0x10], 0x01
+ mov rax, qword ptr [rcx]
+ mov rsi, qword ptr [rcx+0x20]
+ vmovups xmm16, xmmword ptr [rax+rbx*1-0x20]
+ vinserti32x4 ymm16, ymm16, xmmword ptr [rsi+rbx*1-0x20], 0x01
+ vmovups xmm20, xmmword ptr [rax+rbx*1-0x10]
+ vinserti32x4 ymm20, ymm20, xmmword ptr [rsi+rbx*1-0x10], 0x01
+ mov rax, qword ptr [rcx+0x08]
+ vmovups xmm17, xmmword ptr [rax+rbx*1-0x20]
+ vmovups xmm21, xmmword ptr [rax+rbx*1-0x10]
+ cmp dl, 0x06
+ jb 4f
+ mov rsi, qword ptr [rcx+0x28]
+ vinserti32x4 ymm17, ymm17, xmmword ptr [rsi+rbx*1-0x20], 0x01
+ vinserti32x4 ymm21, ymm21, xmmword ptr [rsi+rbx*1-0x10], 0x01
+4:
+ mov rax, qword ptr [rcx+0x10]
+ vmovups xmm18, xmmword ptr [rax+rbx*1-0x20]
+ vmovups xmm22, xmmword ptr [rax+rbx*1-0x10]
+ cmp dl, 0x07
+ jb 4f
+ mov rsi, qword ptr [rcx+0x30]
+ vinserti32x4 ymm18, ymm18, xmmword ptr [rsi+rbx*1-0x20], 0x01
+ vinserti32x4 ymm22, ymm22, xmmword ptr [rsi+rbx*1-0x10], 0x01
+4:
+ mov rax, qword ptr [rcx+0x18]
+ vmovups xmm19, xmmword ptr [rax+rbx*1-0x20]
+ vmovups xmm23, xmmword ptr [rax+rbx*1-0x10]
+ cmp dl, 0x08
+ jb 4f
+ mov rsi, qword ptr [rcx+0x38]
+ vinserti32x4 ymm19, ymm19, xmmword ptr [rsi+rbx*1-0x20], 0x01
+ vinserti32x4 ymm23, ymm23, xmmword ptr [rsi+rbx*1-0x10], 0x01
+4:
vpunpckldq ymm13, ymm16, ymm17
vpunpckhdq ymm17, ymm16, ymm17
vpunpckldq ymm16, ymm18, ymm19
@@ -565,11 +635,11 @@ blake3_hash_many_avx512:
vpbroadcastd ymm25, dword ptr [rip+BLAKE3_IV_1]
vpbroadcastd ymm26, dword ptr [rip+BLAKE3_IV_2]
vpbroadcastd ymm27, dword ptr [rip+BLAKE3_IV_3]
- vmovdqa32 ymm28, ymmword ptr [rax]
- vmovdqa32 ymm29, ymmword ptr [rax+0x40]
+ vmovdqa32 ymm28, ymmword ptr [rsp]
+ vmovdqa32 ymm29, ymmword ptr [rsp+0x40]
vpbroadcastd ymm30, dword ptr [rip+BLAKE3_BLOCK_LEN]
vpbroadcastd ymm31, dword ptr [rsp+0x80]
- mov bl, 0x07
+ mov al, 0x07
4:
vpaddd ymm0, ymm0, ymm14
vpaddd ymm1, ymm1, ymm24
@@ -701,7 +771,7 @@ blake3_hash_many_avx512:
vprord ymm7, ymm7, 0x07
vprord ymm4, ymm4, 0x07
vmovdqa32 ymm8, ymmword ptr [rsp+0xC0]
- dec bl
+ dec al
jnz 4b
vpxord ymm0, ymm0, ymm21
vpxord ymm1, ymm1, ymm25
@@ -711,78 +781,85 @@ blake3_hash_many_avx512:
vpxord ymm5, ymm5, ymm29
vpxord ymm6, ymm6, ymm30
vpxord ymm7, ymm7, ymm31
- movzx ebx, byte ptr [rbp+0x78]
+ movzx eax, byte ptr [rbp+0x78]
+ cmp rbx, qword ptr [rsp+0x100]
jb 2b
- mov rdi, qword ptr [rbp+0x90]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
+ mov rsi, qword ptr [rbp+0x90]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckldq ymm9, ymm2, ymm3
+ vpunpckhdq ymm10, ymm0, ymm1
+ vpunpckldq ymm11, ymm4, ymm5
+ vpunpckldq ymm0, ymm6, ymm7
vshufps ymm12, ymm8, ymm9, 0x4E
- vblendps ymm1, ymm8, ymm12, 0xCC
+ vpblendd ymm1, ymm8, ymm12, 0xCC
vshufps ymm8, ymm11, ymm0, 0x4E
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0xCC
- vblendps ymm3, ymm12, ymm9, 0xCC
- vperm2f128 ymm12, ymm1, ymm2, 0x20
- vmovups ymmword ptr [rdi], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0xCC
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 0x20
- vmovups ymmword ptr [rdi+0x20], ymm7
+ vpunpckhdq ymm13, ymm2, ymm3
+ vpblendd ymm2, ymm11, ymm8, 0xCC
+ vpblendd ymm3, ymm12, ymm9, 0xCC
+ vperm2i128 ymm12, ymm1, ymm2, 0x20
+ vmovdqu ymmword ptr [rsi], ymm12
+ vpunpckhdq ymm14, ymm4, ymm5
+ vpblendd ymm4, ymm8, ymm0, 0xCC
+ vpunpckhdq ymm15, ymm6, ymm7
+ vperm2i128 ymm7, ymm3, ymm4, 0x20
+ vmovdqu ymmword ptr [rsi+0x20], ymm7
vshufps ymm5, ymm10, ymm13, 0x4E
- vblendps ymm6, ymm5, ymm13, 0xCC
+ vpblendd ymm6, ymm5, ymm13, 0xCC
vshufps ymm13, ymm14, ymm15, 0x4E
- vblendps ymm10, ymm10, ymm5, 0xCC
- vblendps ymm14, ymm14, ymm13, 0xCC
- vperm2f128 ymm8, ymm10, ymm14, 0x20
- vmovups ymmword ptr [rdi+0x40], ymm8
- vblendps ymm15, ymm13, ymm15, 0xCC
- vperm2f128 ymm13, ymm6, ymm15, 0x20
- vmovups ymmword ptr [rdi+0x60], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 0x31
- vperm2f128 ymm11, ymm3, ymm4, 0x31
- vmovups ymmword ptr [rdi+0x80], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 0x31
- vperm2f128 ymm15, ymm6, ymm15, 0x31
- vmovups ymmword ptr [rdi+0xA0], ymm11
- vmovups ymmword ptr [rdi+0xC0], ymm14
- vmovups ymmword ptr [rdi+0xE0], ymm15
- lea r8, qword ptr [rax+0x20]
- kortestw k1, k1
- cmovnz rax, r8
- add rdi, 0x100
- mov qword ptr [rbp+0x90], rdi
- add rcx, 0x40
+ vpblendd ymm10, ymm10, ymm5, 0xCC
+ vpblendd ymm14, ymm14, ymm13, 0xCC
+ vperm2i128 ymm8, ymm10, ymm14, 0x20
+ vmovdqu ymmword ptr [rsi+0x40], ymm8
+ vpblendd ymm15, ymm13, ymm15, 0xCC
+ vperm2i128 ymm13, ymm6, ymm15, 0x20
+ vmovdqu ymmword ptr [rsi+0x60], ymm13
+ vperm2i128 ymm9, ymm1, ymm2, 0x31
+ vmovdqu ymmword ptr [rsi+0x80], ymm9
+ cmp dl, 0x06
+ jb 4f
+ vperm2i128 ymm11, ymm3, ymm4, 0x31
+ vmovdqu ymmword ptr [rsi+0xA0], ymm11
+ cmp dl, 0x07
+ jb 4f
+ vperm2i128 ymm14, ymm10, ymm14, 0x31
+ vmovdqu ymmword ptr [rsi+0xC0], ymm14
+ cmp dl, 0x08
+ jb 4f
+ vperm2i128 ymm15, ymm6, ymm15, 0x31
+ vmovdqu ymmword ptr [rsi+0xE0], ymm15
+4:
+ jmp 9b
+3:
+ mov rax, qword ptr [rsp+0x100]
mov rbx, qword ptr [rbp+0x90]
movzx esi, byte ptr [rbp+0x78]
movzx edi, byte ptr [rbp+0x88]
- test dl, 0x04
- jz 3f
+ mov r8d, 0xAAAA
+ kmovw k2, r8d
+ mov r8d, 0x8888
+ kmovw k3, r8d
+ cmp dl, 0x02
+ jbe 3f
vbroadcasti32x4 zmm0, xmmword ptr [r9]
vbroadcasti32x4 zmm1, xmmword ptr [r9+0x10]
vbroadcasti32x4 zmm4, xmmword ptr [rip+BLAKE3_IV]
mov r8d, 0x4444
- kmovw k2, r8d
- vmovdqa xmm6, xmmword ptr [rax]
- vmovdqa xmm7, xmmword ptr [rax+0x40]
+ kmovw k4, r8d
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+0x40]
+ vpbroadcastd zmm5, dword ptr [rip+BLAKE3_BLOCK_LEN]
vpunpckldq xmm8, xmm6, xmm7
- vpunpckhdq xmm9, xmm6, xmm7
- vpermq ymm8, ymm8, 0xDC
- vpermq ymm9, ymm9, 0xDC
- vpbroadcastd zmm6, dword ptr [rip+BLAKE3_BLOCK_LEN]
- vinserti64x4 zmm5, zmm8, ymm9, 0x01
- vpblendmd zmm5 {k2}, zmm5, zmm6
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti64x4 zmm8, zmm8, ymm7, 0x01
+ vpermq zmm8, zmm8, 0xDC
+ vpblendmd zmm5 {k4}, zmm8, zmm5
mov r8, qword ptr [rcx]
mov r10, qword ptr [rcx+0x08]
mov r11, qword ptr [rcx+0x10]
+ cmp dl, 0x04
+ jb 4f
mov r12, qword ptr [rcx+0x18]
- mov r13d, 0xAAAA
- kmovw k2, r13d
- mov r13d, 0x8888
- kmovw k3, r13d
+4:
movzx r13d, byte ptr [rbp+0x80]
or r13d, esi
xor r14d, r14d
@@ -790,32 +867,34 @@ blake3_hash_many_avx512:
movzx r15d, byte ptr [rbp+0x88]
or r15d, r13d
add r14, 0x40
- cmp r14, qword ptr [rsp+0x100]
+ cmp r14, rax
cmovz r13d, r15d
mov dword ptr [rsp+0x80], r13d
vmovdqa32 zmm2, zmm4
- vpbroadcastd zmm6, dword ptr [rsp+0x80]
- vpblendmd zmm3 {k3}, zmm5, zmm6
+ vpblendmd zmm3 {k3}, zmm5, dword ptr [rsp+0x80] {1to16}
vmovdqu32 zmm10, zmmword ptr [r8+r14*1-0x40]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03
vmovdqu32 zmm11, zmmword ptr [r8+r14*1-0x30]
+ vmovdqu32 zmm12, zmmword ptr [r8+r14*1-0x20]
+ vmovdqu32 zmm13, zmmword ptr [r8+r14*1-0x10]
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01
vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x30], 0x01
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r10+r14*1-0x20], 0x01
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r10+r14*1-0x10], 0x01
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02
vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x30], 0x02
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r11+r14*1-0x20], 0x02
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r11+r14*1-0x10], 0x02
+ cmp dl, 0x04
+ jb 4f
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03
vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x30], 0x03
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r12+r14*1-0x20], 0x03
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r12+r14*1-0x10], 0x03
+4:
vshufps zmm6, zmm10, zmm11, 0x88
vshufps zmm7, zmm10, zmm11, 0xDD
- vmovdqu32 zmm10, zmmword ptr [r8+r14*1-0x20]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x20], 0x01
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x20], 0x02
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x20], 0x03
- vmovdqu32 zmm11, zmmword ptr [r8+r14*1-0x10]
- vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x10], 0x01
- vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x10], 0x02
- vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x10], 0x03
- vshufps zmm8, zmm10, zmm11, 0x88
- vshufps zmm9, zmm10, zmm11, 0xDD
+ vshufps zmm8, zmm12, zmm13, 0x88
+ vshufps zmm9, zmm12, zmm13, 0xDD
vpshufd zmm8, zmm8, 0x93
vpshufd zmm9, zmm9, 0x93
mov r15b, 0x07
@@ -856,24 +935,25 @@ blake3_hash_many_avx512:
vpshufd zmm2, zmm2, 0x93
dec r15b
jz 4f
- vshufps zmm12, zmm6, zmm7, 0xD6
- vpshufd zmm13, zmm6, 0x0F
- vpshufd zmm6, zmm12, 0x39
- vshufps zmm12, zmm8, zmm9, 0xFA
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vpunpcklqdq zmm12, zmm9, zmm7
- vpblendmd zmm12 {k3}, zmm12, zmm8
- vpshufd zmm12, zmm12, 0x78
+ vshufps zmm14, zmm6, zmm7, 0xD6
+ vpshufd zmm15, zmm6, 0x0F
+ vpshufd zmm6, zmm14, 0x39
+ vshufps zmm14, zmm8, zmm9, 0xFA
+ vpblendmd zmm15 {k2}, zmm15, zmm14
+ vpunpcklqdq zmm14, zmm9, zmm7
+ vpblendmd zmm14 {k3}, zmm14, zmm8
+ vpshufd zmm14, zmm14, 0x78
vpunpckhdq zmm7, zmm7, zmm9
vpunpckldq zmm8, zmm8, zmm7
vpshufd zmm9, zmm8, 0x1E
- vmovdqa32 zmm7, zmm13
- vmovdqa32 zmm8, zmm12
+ vmovdqa32 zmm7, zmm15
+ vmovdqa32 zmm8, zmm14
jmp 4b
4:
vpxord zmm0, zmm0, zmm2
vpxord zmm1, zmm1, zmm3
mov r13d, esi
+ cmp r14, rax
jb 2b
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
@@ -881,27 +961,33 @@ blake3_hash_many_avx512:
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
vextracti32x4 xmmword ptr [rbx+0x40], zmm0, 0x02
vextracti32x4 xmmword ptr [rbx+0x50], zmm1, 0x02
+ cmp dl, 0x04
+ jb 4f
vextracti32x4 xmmword ptr [rbx+0x60], zmm0, 0x03
vextracti32x4 xmmword ptr [rbx+0x70], zmm1, 0x03
- lea r15, qword ptr [rax+0x10]
- kortestw k1, k1
- cmovnz rax, r15
- add rbx, 0x80
- add rcx, 0x20
- test dl, 0x02
- jz 3f
+4:
+ jmp 9b
+3:
+ test dl, dl
+ jz 9b
vbroadcasti128 ymm0, xmmword ptr [r9]
vbroadcasti128 ymm1, xmmword ptr [r9+0x10]
vbroadcasti128 ymm4, xmmword ptr [rip+BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01
- vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- vmovd xmm6, dword ptr [rax+0x04]
- vpinsrd xmm6, xmm6, dword ptr [rax+0x44], 0x01
- vpinsrd xmm6, xmm6, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- vinserti128 ymm5, ymm5, xmm6, 0x01
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+0x40]
+ mov r8d, 0x40
+ vpbroadcastq ymm5, r8
+ mov r8d, 0x55
+ kmovw k4, r8d
+ vpunpckldq xmm8, xmm6, xmm7
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti128 ymm8, ymm8, xmm7, 0x01
+ vpermq ymm5 {k4}, ymm8, 0xDC
mov r8, qword ptr [rcx]
+ cmp dl, 0x02
+ jb 4f
mov r10, qword ptr [rcx+0x08]
+4:
mov r11d, esi
movzx r12d, byte ptr [rbp+0x80]
or r11d, r12d
@@ -910,24 +996,26 @@ blake3_hash_many_avx512:
movzx r13d, byte ptr [rbp+0x88]
or r13d, r11d
add r12, 0x40
- cmp r12, qword ptr [rsp+0x100]
+ cmp r12, rax
cmovz r11d, r13d
mov dword ptr [rsp+0x80], r11d
vmovdqa ymm2, ymm4
- vpbroadcastd ymm6, dword ptr [rsp+0x80]
- vpblendd ymm3, ymm5, ymm6, 0x88
+ vpblendmd ymm3 {k3}, ymm5, dword ptr [rsp+0x80] {1to8}
vmovdqu ymm10, ymmword ptr [r8+r12*1-0x40]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01
vmovdqu ymm11, ymmword ptr [r8+r12*1-0x30]
+ vmovdqu ymm12, ymmword ptr [r8+r12*1-0x20]
+ vmovdqu ymm13, ymmword ptr [r8+r12*1-0x10]
+ cmp dl, 0x02
+ jb 4f
+ vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01
vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x30], 0x01
+ vinserti128 ymm12, ymm12, xmmword ptr [r10+r12*1-0x20], 0x01
+ vinserti128 ymm13, ymm13, xmmword ptr [r10+r12*1-0x10], 0x01
+4:
vshufps ymm6, ymm10, ymm11, 0x88
vshufps ymm7, ymm10, ymm11, 0xDD
- vmovdqu ymm10, ymmword ptr [r8+r12*1-0x20]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x20], 0x01
- vmovdqu ymm11, ymmword ptr [r8+r12*1-0x10]
- vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x10], 0x01
- vshufps ymm8, ymm10, ymm11, 0x88
- vshufps ymm9, ymm10, ymm11, 0xDD
+ vshufps ymm8, ymm12, ymm13, 0x88
+ vshufps ymm9, ymm12, ymm13, 0xDD
vpshufd ymm8, ymm8, 0x93
vpshufd ymm9, ymm9, 0x93
mov r13b, 0x07
@@ -986,109 +1074,17 @@ blake3_hash_many_avx512:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
mov r11d, esi
+ cmp r12, rax
jb 2b
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+0x10], xmm1
+ cmp dl, 0x02
+ jb 4f
vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
- lea r13, qword ptr [rax+0x08]
- kortestw k1, k1
- cmovnz rax, r13
- add rbx, 0x40
- add rcx, 0x10
- test dl, 0x01
- jz 9b
- vmovdqu xmm0, xmmword ptr [r9]
- vmovdqu xmm1, xmmword ptr [r9+0x10]
- vmovdqa xmm4, xmmword ptr [rip+BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01
- vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02
- mov r8, qword ptr [rcx]
- mov r10d, esi
- movzx r11d, byte ptr [rbp+0x80]
- or r10d, r11d
- xor r11d, r11d
-2:
- movzx r12d, byte ptr [rbp+0x88]
- or r12d, r10d
- add r11, 0x40
- cmp r11, qword ptr [rsp+0x100]
- cmovz r10d, r12d
- vmovdqa xmm2, xmm4
- vpinsrd xmm3, xmm5, r10d, 0x03
- vmovdqu xmm10, xmmword ptr [r8+r11*1-0x40]
- vmovdqu xmm11, xmmword ptr [r8+r11*1-0x30]
- vshufps xmm6, xmm10, xmm11, 0x88
- vshufps xmm7, xmm10, xmm11, 0xDD
- vmovdqu xmm10, xmmword ptr [r8+r11*1-0x20]
- vmovdqu xmm11, xmmword ptr [r8+r11*1-0x10]
- vshufps xmm8, xmm10, xmm11, 0x88
- vshufps xmm9, xmm10, xmm11, 0xDD
- vpshufd xmm8, xmm8, 0x93
- vpshufd xmm9, xmm9, 0x93
- mov r12b, 0x07
4:
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x10
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x0C
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x08
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x07
- vpshufd xmm0, xmm0, 0x93
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x39
- vpaddd xmm0, xmm0, xmm8
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x10
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x0C
- vpaddd xmm0, xmm0, xmm9
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 0x08
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0x07
- vpshufd xmm0, xmm0, 0x39
- vpshufd xmm3, xmm3, 0x4E
- vpshufd xmm2, xmm2, 0x93
- dec r12b
- jz 4f
- vshufps xmm10, xmm6, xmm7, 0xD6
- vpshufd xmm11, xmm6, 0x0F
- vpshufd xmm6, xmm10, 0x39
- vshufps xmm10, xmm8, xmm9, 0xFA
- vpblendd xmm11, xmm11, xmm10, 0xAA
- vpunpcklqdq xmm10, xmm9, xmm7
- vpblendd xmm10, xmm10, xmm8, 0x88
- vpshufd xmm10, xmm10, 0x78
- vpunpckhdq xmm7, xmm7, xmm9
- vpunpckldq xmm8, xmm8, xmm7
- vpshufd xmm9, xmm8, 0x1E
- vmovdqa xmm7, xmm11
- vmovdqa xmm8, xmm10
- jmp 4b
-4:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov r10d, esi
- jb 2b
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 9b
-
.p2align 6
_blake3_compress_in_place_avx512:
blake3_compress_in_place_avx512:
@@ -1286,10 +1282,10 @@ _blake3_xof_many_avx512:
cmp rax, 0x01
jnbe 2f
sub rsp, 0x48
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+0x10], xmm7
- movdqa xmmword ptr [rsp+0x20], xmm8
- movdqa xmmword ptr [rsp+0x30], xmm9
+ movaps xmmword ptr [rsp], xmm6
+ movaps xmmword ptr [rsp+0x10], xmm7
+ movaps xmmword ptr [rsp+0x20], xmm8
+ movaps xmmword ptr [rsp+0x30], xmm9
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmm1, xmmword ptr [rcx+0x10]
movzx r8d, r8b
@@ -1373,26 +1369,26 @@ _blake3_xof_many_avx512:
vmovdqu xmmword ptr [r8+0x20], xmm2
vmovdqu xmmword ptr [r8+0x30], xmm3
vzeroupper
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+0x10]
- movdqa xmm8, xmmword ptr [rsp+0x20]
- movdqa xmm9, xmmword ptr [rsp+0x30]
+ movaps xmm6, xmmword ptr [rsp]
+ movaps xmm7, xmmword ptr [rsp+0x10]
+ movaps xmm8, xmmword ptr [rsp+0x20]
+ movaps xmm9, xmmword ptr [rsp+0x30]
add rsp, 0x48
ret
2:
push rbp
mov rbp, rsp
sub rsp, 0x1A0
- movdqa xmmword ptr [rbp-0xA0], xmm6
- movdqa xmmword ptr [rbp-0x90], xmm7
- movdqa xmmword ptr [rbp-0x80], xmm8
- movdqa xmmword ptr [rbp-0x70], xmm9
- movdqa xmmword ptr [rbp-0x60], xmm10
- movdqa xmmword ptr [rbp-0x50], xmm11
- movdqa xmmword ptr [rbp-0x40], xmm12
- movdqa xmmword ptr [rbp-0x30], xmm13
- movdqa xmmword ptr [rbp-0x20], xmm14
- movdqa xmmword ptr [rbp-0x10], xmm15
+ movaps xmmword ptr [rbp-0xA0], xmm6
+ movaps xmmword ptr [rbp-0x90], xmm7
+ movaps xmmword ptr [rbp-0x80], xmm8
+ movaps xmmword ptr [rbp-0x70], xmm9
+ movaps xmmword ptr [rbp-0x60], xmm10
+ movaps xmmword ptr [rbp-0x50], xmm11
+ movaps xmmword ptr [rbp-0x40], xmm12
+ movaps xmmword ptr [rbp-0x30], xmm13
+ movaps xmmword ptr [rbp-0x20], xmm14
+ movaps xmmword ptr [rbp-0x10], xmm15
and rsp, 0xFFFFFFFFFFFFFFC0
vpbroadcastd zmm0, r9d
shr r9, 0x20
@@ -1704,23 +1700,23 @@ _blake3_xof_many_avx512:
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x40], zmm1
add r9, 0x400
- cmp rax, 0x18
- lea rax, qword ptr [rax-0x10]
+ sub rax, 0x10
+ cmp rax, 0x08
jnbe 3b
test al, al
jnz 2f
9:
vzeroupper
- movdqa xmm6, xmmword ptr [rbp-0xA0]
- movdqa xmm7, xmmword ptr [rbp-0x90]
- movdqa xmm8, xmmword ptr [rbp-0x80]
- movdqa xmm9, xmmword ptr [rbp-0x70]
- movdqa xmm10, xmmword ptr [rbp-0x60]
- movdqa xmm11, xmmword ptr [rbp-0x50]
- movdqa xmm12, xmmword ptr [rbp-0x40]
- movdqa xmm13, xmmword ptr [rbp-0x30]
- movdqa xmm14, xmmword ptr [rbp-0x20]
- movdqa xmm15, xmmword ptr [rbp-0x10]
+ movaps xmm6, xmmword ptr [rbp-0xA0]
+ movaps xmm7, xmmword ptr [rbp-0x90]
+ movaps xmm8, xmmword ptr [rbp-0x80]
+ movaps xmm9, xmmword ptr [rbp-0x70]
+ movaps xmm10, xmmword ptr [rbp-0x60]
+ movaps xmm11, xmmword ptr [rbp-0x50]
+ movaps xmm12, xmmword ptr [rbp-0x40]
+ movaps xmm13, xmmword ptr [rbp-0x30]
+ movaps xmm14, xmmword ptr [rbp-0x20]
+ movaps xmm15, xmmword ptr [rbp-0x10]
mov rsp, rbp
pop rbp
ret
diff --git a/c/blake3_avx512_x86-64_windows_msvc.asm b/c/blake3_avx512_x86-64_windows_msvc.asm
index caa772c..be27340 100644
--- a/c/blake3_avx512_x86-64_windows_msvc.asm
+++ b/c/blake3_avx512_x86-64_windows_msvc.asm
@@ -22,16 +22,16 @@ _blake3_hash_many_avx512 PROC
push r15
mov rbp, rsp
sub rsp, 1E8h
- movdqa xmmword ptr [rbp-0A8h], xmm6
- movdqa xmmword ptr [rbp-98h], xmm7
- movdqa xmmword ptr [rbp-88h], xmm8
- movdqa xmmword ptr [rbp-78h], xmm9
- movdqa xmmword ptr [rbp-68h], xmm10
- movdqa xmmword ptr [rbp-58h], xmm11
- movdqa xmmword ptr [rbp-48h], xmm12
- movdqa xmmword ptr [rbp-38h], xmm13
- movdqa xmmword ptr [rbp-28h], xmm14
- movdqa xmmword ptr [rbp-18h], xmm15
+ movaps xmmword ptr [rbp-0A8h], xmm6
+ movaps xmmword ptr [rbp-98h], xmm7
+ movaps xmmword ptr [rbp-88h], xmm8
+ movaps xmmword ptr [rbp-78h], xmm9
+ movaps xmmword ptr [rbp-68h], xmm10
+ movaps xmmword ptr [rbp-58h], xmm11
+ movaps xmmword ptr [rbp-48h], xmm12
+ movaps xmmword ptr [rbp-38h], xmm13
+ movaps xmmword ptr [rbp-28h], xmm14
+ movaps xmmword ptr [rbp-18h], xmm15
and rsp, -40h
mov rax, qword ptr [rbp+68h]
movzx ebx, byte ptr [rbp+70h]
@@ -40,7 +40,7 @@ _blake3_hash_many_avx512 PROC
vpbroadcastd ymm0, eax
shr rax, 20h
vpbroadcastd ymm1, eax
- vmovdqa32 ymm2 {k1} {z}, ymmword ptr [ADD0]
+ vmovdqa32 ymm2 {k1} {z}, ymmword ptr [ADD0+0]
vmovdqa32 ymm3 {k1} {z}, ymmword ptr [ADD0+32]
vpaddd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp], ymm2
@@ -55,9 +55,9 @@ _blake3_hash_many_avx512 PROC
vmovdqa ymmword ptr [rsp+60h], ymm1
shl r8, 6h
mov qword ptr [rsp+100h], r8
- cmp rdx, 10h
- jb final15blocks
-ALIGN 16
+ cmp rdx, 8h
+ jbe final8blocks
+ALIGN 16
outerloop16:
vpbroadcastd zmm0, dword ptr [r9]
vpbroadcastd zmm1, dword ptr [r9+4h]
@@ -83,39 +83,60 @@ innerloop16:
mov rdi, qword ptr [rcx+10h]
mov r8, qword ptr [rcx+18h]
mov r10, qword ptr [rcx+40h]
- mov r11, qword ptr [rcx+48h]
- mov r12, qword ptr [rcx+50h]
- mov r13, qword ptr [rcx+58h]
vmovdqu32 ymm8, ymmword ptr [rax+rbx*1-40h]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rbx*1-40h], 1h
vmovdqu32 ymm9, ymmword ptr [rsi+rbx*1-40h]
+ cmp rdx, 0Ah
+ jb @F
+ mov r11, qword ptr [rcx+48h]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rbx*1-40h], 1h
+@@:
vpunpckldq zmm10, zmm8, zmm9
vpunpckhdq zmm11, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [rdi+rbx*1-40h]
+ cmp rdx, 0Bh
+ jb @F
+ mov r12, qword ptr [rcx+50h]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rbx*1-40h], 1h
+@@:
vmovdqu32 ymm9, ymmword ptr [r8+rbx*1-40h]
+ cmp rdx, 0Ch
+ jb @F
+ mov r13, qword ptr [rcx+58h]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rbx*1-40h], 1h
+@@:
vpunpckldq zmm12, zmm8, zmm9
vpunpckhdq zmm13, zmm8, zmm9
mov rax, qword ptr [rcx+20h]
mov rsi, qword ptr [rcx+28h]
mov rdi, qword ptr [rcx+30h]
mov r8, qword ptr [rcx+38h]
- mov r10, qword ptr [rcx+60h]
- mov r11, qword ptr [rcx+68h]
- mov r12, qword ptr [rcx+70h]
- mov r13, qword ptr [rcx+78h]
vmovdqu32 ymm8, ymmword ptr [rax+rbx*1-40h]
+ cmp rdx, 0Dh
+ jb @F
+ mov r10, qword ptr [rcx+60h]
vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rbx*1-40h], 1h
+@@:
vmovdqu32 ymm9, ymmword ptr [rsi+rbx*1-40h]
+ cmp rdx, 0Eh
+ jb @F
+ mov r11, qword ptr [rcx+68h]
vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rbx*1-40h], 1h
+@@:
vpunpckldq zmm14, zmm8, zmm9
vpunpckhdq zmm15, zmm8, zmm9
vmovdqu32 ymm8, ymmword ptr [rdi+rbx*1-40h]
+ cmp rdx, 0Fh
+ jb @F
+ mov r12, qword ptr [rcx+70h]
vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rbx*1-40h], 1h
+@@:
vmovdqu32 ymm9, ymmword ptr [r8+rbx*1-40h]
+ cmp rdx, 10h
+ jb @F
+ mov r13, qword ptr [rcx+78h]
vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rbx*1-40h], 1h
+@@:
vpunpckldq zmm16, zmm8, zmm9
vpunpckhdq zmm17, zmm8, zmm9
vmovdqa32 zmm8, zmmword ptr [INDEX0]
@@ -145,19 +166,31 @@ innerloop16:
mov rdi, qword ptr [rcx+10h]
mov r8, qword ptr [rcx+18h]
mov r10, qword ptr [rcx+40h]
- mov r11, qword ptr [rcx+48h]
- mov r12, qword ptr [rcx+50h]
- mov r13, qword ptr [rcx+58h]
vmovdqu32 ymm11, ymmword ptr [rax+rbx*1-20h]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rbx*1-20h], 1h
vmovdqu32 ymm13, ymmword ptr [rsi+rbx*1-20h]
+ cmp rdx, 0Ah
+ jb @F
+ mov r11, qword ptr [rcx+48h]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r11+rbx*1+80h]
+@@:
vpunpckldq zmm15, zmm11, zmm13
vpunpckhdq zmm17, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [rdi+rbx*1-20h]
+ cmp rdx, 0Bh ; fewer than 0Bh inputs left: skip the [rcx+50h] pointer
+ jb @F
+ mov r12, qword ptr [rcx+50h]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r12+rbx*1+80h] ; prefetch via r12, the pointer just loaded (was r13)
+@@:
vmovdqu32 ymm13, ymmword ptr [r8+rbx*1-20h]
+ cmp rdx, 0Ch
+ jb @F
+ mov r13, qword ptr [rcx+58h]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r13+rbx*1+80h]
+@@:
vpunpckldq zmm22, zmm11, zmm13
vpunpckhdq zmm23, zmm11, zmm13
prefetcht0 byte ptr [rax+rbx*1+80h]
@@ -165,33 +198,42 @@ innerloop16:
prefetcht0 byte ptr [rdi+rbx*1+80h]
prefetcht0 byte ptr [r8+rbx*1+80h]
prefetcht0 byte ptr [r10+rbx*1+80h]
- prefetcht0 byte ptr [r11+rbx*1+80h]
- prefetcht0 byte ptr [r12+rbx*1+80h]
- prefetcht0 byte ptr [r13+rbx*1+80h]
mov rax, qword ptr [rcx+20h]
mov rsi, qword ptr [rcx+28h]
mov rdi, qword ptr [rcx+30h]
mov r8, qword ptr [rcx+38h]
- mov r10, qword ptr [rcx+60h]
- mov r11, qword ptr [rcx+68h]
- mov r12, qword ptr [rcx+70h]
- mov r13, qword ptr [rcx+78h]
vmovdqu32 ymm11, ymmword ptr [rax+rbx*1-20h]
+ cmp rdx, 0Dh
+ jb @F
+ mov r10, qword ptr [rcx+60h]
vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r10+rbx*1+80h]
+@@:
vmovdqu32 ymm13, ymmword ptr [rsi+rbx*1-20h]
+ cmp rdx, 0Eh
+ jb @F
+ mov r11, qword ptr [rcx+68h]
vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r11+rbx*1+80h]
+@@:
vpunpckldq zmm24, zmm11, zmm13
vpunpckhdq zmm25, zmm11, zmm13
vmovdqu32 ymm11, ymmword ptr [rdi+rbx*1-20h]
+ cmp rdx, 0Fh
+ jb @F
+ mov r12, qword ptr [rcx+70h]
vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r12+rbx*1+80h]
+@@:
vmovdqu32 ymm13, ymmword ptr [r8+rbx*1-20h]
+ cmp rdx, 10h
+ jb @F
+ mov r13, qword ptr [rcx+78h]
vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rbx*1-20h], 1h
+ prefetcht0 byte ptr [r13+rbx*1+80h]
+@@:
vpunpckldq zmm26, zmm11, zmm13
vpunpckhdq zmm27, zmm11, zmm13
- prefetcht0 byte ptr [rax+rbx*1+80h]
- prefetcht0 byte ptr [rsi+rbx*1+80h]
- prefetcht0 byte ptr [rdi+rbx*1+80h]
- prefetcht0 byte ptr [r8+rbx*1+80h]
prefetcht0 byte ptr [r10+rbx*1+80h]
prefetcht0 byte ptr [r11+rbx*1+80h]
prefetcht0 byte ptr [r12+rbx*1+80h]
@@ -366,6 +408,7 @@ innerloop16:
vpxord zmm6, zmm6, zmm30
vpxord zmm7, zmm7, zmm31
movzx eax, byte ptr [rbp+78h]
+ cmp rbx, qword ptr [rsp+100h]
jb innerloop16
mov rsi, qword ptr [rbp+90h]
vpunpckldq zmm8, zmm0, zmm2
@@ -384,8 +427,8 @@ innerloop16:
vpunpckhdq zmm5, zmm12, zmm14
vpunpckldq zmm6, zmm13, zmm15
vpunpckhdq zmm7, zmm13, zmm15
- vmovdqa32 zmm16, zmmword ptr [$+1BDh]
- vmovdqa32 zmm18, zmmword ptr [$+1F3h]
+ vmovdqa32 zmm16, zmmword ptr [INDEX0]
+ vmovdqa32 zmm18, zmmword ptr [INDEX1]
vmovdqa32 zmm8, zmm0
vpermt2d zmm8, zmm16, zmm4
vpermt2d zmm0, zmm18, zmm4
@@ -407,12 +450,26 @@ innerloop16:
vextracti64x4 ymmword ptr [rsi+0C0h], zmm2, 0h
vextracti64x4 ymmword ptr [rsi+0E0h], zmm3, 0h
vextracti64x4 ymmword ptr [rsi+100h], zmm8, 1h
+ cmp rdx, 0Ah
+ jb unwind
vextracti64x4 ymmword ptr [rsi+120h], zmm10, 1h
+ cmp rdx, 0Bh
+ jb unwind
vextracti64x4 ymmword ptr [rsi+140h], zmm12, 1h
+ cmp rdx, 0Ch
+ jb unwind
vextracti64x4 ymmword ptr [rsi+160h], zmm14, 1h
+ cmp rdx, 0Dh
+ jb unwind
vextracti64x4 ymmword ptr [rsi+180h], zmm0, 1h
+ cmp rdx, 0Eh
+ jb unwind
vextracti64x4 ymmword ptr [rsi+1A0h], zmm1, 1h
+ cmp rdx, 0Fh
+ jb unwind
vextracti64x4 ymmword ptr [rsi+1C0h], zmm2, 1h
+ cmp rdx, 10h
+ jb unwind
vextracti64x4 ymmword ptr [rsi+1E0h], zmm3, 1h
vmovdqa32 zmm8, zmmword ptr [rsp]
vmovdqa32 zmm9, zmmword ptr [rsp+40h]
@@ -426,22 +483,22 @@ innerloop16:
mov qword ptr [rbp+90h], rsi
add rcx, 80h
sub rdx, 10h
- cmp rdx, 10h
- jnb outerloop16
+ cmp rdx, 8h
+ jnbe outerloop16
test rdx, rdx
- jnz final15blocks
+ jnz final8blocks
unwind:
vzeroupper
- movdqa xmm6, xmmword ptr [rbp-0A8h]
- movdqa xmm7, xmmword ptr [rbp-98h]
- movdqa xmm8, xmmword ptr [rbp-88h]
- movdqa xmm9, xmmword ptr [rbp-78h]
- movdqa xmm10, xmmword ptr [rbp-68h]
- movdqa xmm11, xmmword ptr [rbp-58h]
- movdqa xmm12, xmmword ptr [rbp-48h]
- movdqa xmm13, xmmword ptr [rbp-38h]
- movdqa xmm14, xmmword ptr [rbp-28h]
- movdqa xmm15, xmmword ptr [rbp-18h]
+ movaps xmm6, xmmword ptr [rbp-0A8h]
+ movaps xmm7, xmmword ptr [rbp-98h]
+ movaps xmm8, xmmword ptr [rbp-88h]
+ movaps xmm9, xmmword ptr [rbp-78h]
+ movaps xmm10, xmmword ptr [rbp-68h]
+ movaps xmm11, xmmword ptr [rbp-58h]
+ movaps xmm12, xmmword ptr [rbp-48h]
+ movaps xmm13, xmmword ptr [rbp-38h]
+ movaps xmm14, xmmword ptr [rbp-28h]
+ movaps xmm15, xmmword ptr [rbp-18h]
mov rsp, rbp
pop r15
pop r14
@@ -453,10 +510,9 @@ unwind:
pop rbx
ret
ALIGN 16
-final15blocks:
- mov rax, rsp
- test dl, 8h
- jz final7blocks
+final8blocks:
+ cmp dl, 4h
+ jbe final4blocks
vpbroadcastd ymm0, dword ptr [r9]
vpbroadcastd ymm1, dword ptr [r9+4h]
vpbroadcastd ymm2, dword ptr [r9+8h]
@@ -465,45 +521,50 @@ final15blocks:
vpbroadcastd ymm5, dword ptr [r9+14h]
vpbroadcastd ymm6, dword ptr [r9+18h]
vpbroadcastd ymm7, dword ptr [r9+1Ch]
- movzx ebx, byte ptr [rbp+78h]
- movzx esi, byte ptr [rbp+80h]
- or ebx, esi
- xor esi, esi
+ movzx eax, byte ptr [rbp+78h]
+ movzx ebx, byte ptr [rbp+80h]
+ or eax, ebx
+ xor ebx, ebx
innerloop8:
- movzx edi, byte ptr [rbp+88h]
- or edi, ebx
- add rsi, 40h
- cmp rsi, qword ptr [rsp+100h]
- cmovz ebx, edi
- mov dword ptr [rsp+80h], ebx
- mov ebx, 0CCh
- kmovw k2, ebx
- mov ebx, 33h
- kmovw k3, ebx
- mov rbx, qword ptr [rcx]
- mov rdi, qword ptr [rcx+20h]
- vmovups xmm8, xmmword ptr [rbx+rsi*1-40h]
- vinserti32x4 ymm8, ymm8, xmmword ptr [rdi+rsi*1-40h], 1h
- vmovups xmm12, xmmword ptr [rbx+rsi*1-30h]
- vinserti32x4 ymm12, ymm12, xmmword ptr [rdi+rsi*1-30h], 1h
- mov rbx, qword ptr [rcx+8h]
- mov rdi, qword ptr [rcx+28h]
- vmovups xmm9, xmmword ptr [rbx+rsi*1-40h]
- vinserti32x4 ymm9, ymm9, xmmword ptr [rdi+rsi*1-40h], 1h
- vmovups xmm13, xmmword ptr [rbx+rsi*1-30h]
- vinserti32x4 ymm13, ymm13, xmmword ptr [rdi+rsi*1-30h], 1h
- mov rbx, qword ptr [rcx+10h]
- mov rdi, qword ptr [rcx+30h]
- vmovups xmm10, xmmword ptr [rbx+rsi*1-40h]
- vinserti32x4 ymm10, ymm10, xmmword ptr [rdi+rsi*1-40h], 1h
- vmovups xmm14, xmmword ptr [rbx+rsi*1-30h]
- vinserti32x4 ymm14, ymm14, xmmword ptr [rdi+rsi*1-30h], 1h
- mov rbx, qword ptr [rcx+18h]
- mov rdi, qword ptr [rcx+38h]
- vmovups xmm11, xmmword ptr [rbx+rsi*1-40h]
- vinserti32x4 ymm11, ymm11, xmmword ptr [rdi+rsi*1-40h], 1h
- vmovups xmm15, xmmword ptr [rbx+rsi*1-30h]
- vinserti32x4 ymm15, ymm15, xmmword ptr [rdi+rsi*1-30h], 1h
+ movzx esi, byte ptr [rbp+88h]
+ or esi, eax
+ add rbx, 40h
+ cmp rbx, qword ptr [rsp+100h]
+ cmovz eax, esi
+ mov dword ptr [rsp+80h], eax
+ mov rax, qword ptr [rcx]
+ mov rsi, qword ptr [rcx+20h]
+ vmovups xmm8, xmmword ptr [rax+rbx*1-40h]
+ vinserti32x4 ymm8, ymm8, xmmword ptr [rsi+rbx*1-40h], 1h
+ vmovups xmm12, xmmword ptr [rax+rbx*1-30h]
+ vinserti32x4 ymm12, ymm12, xmmword ptr [rsi+rbx*1-30h], 1h
+ mov rax, qword ptr [rcx+8h]
+ vmovups xmm9, xmmword ptr [rax+rbx*1-40h]
+ vmovups xmm13, xmmword ptr [rax+rbx*1-30h]
+ cmp dl, 6h
+ jb @F
+ mov rsi, qword ptr [rcx+28h]
+ vinserti32x4 ymm9, ymm9, xmmword ptr [rsi+rbx*1-40h], 1h
+ vinserti32x4 ymm13, ymm13, xmmword ptr [rsi+rbx*1-30h], 1h
+@@:
+ mov rax, qword ptr [rcx+10h]
+ vmovups xmm10, xmmword ptr [rax+rbx*1-40h]
+ vmovups xmm14, xmmword ptr [rax+rbx*1-30h]
+ cmp dl, 7h
+ jb @F
+ mov rsi, qword ptr [rcx+30h]
+ vinserti32x4 ymm10, ymm10, xmmword ptr [rsi+rbx*1-40h], 1h
+ vinserti32x4 ymm14, ymm14, xmmword ptr [rsi+rbx*1-30h], 1h
+@@:
+ mov rax, qword ptr [rcx+18h]
+ vmovups xmm11, xmmword ptr [rax+rbx*1-40h]
+ vmovups xmm15, xmmword ptr [rax+rbx*1-30h]
+ cmp dl, 8h
+ jb @F
+ mov rsi, qword ptr [rcx+38h]
+ vinserti32x4 ymm11, ymm11, xmmword ptr [rsi+rbx*1-40h], 1h
+ vinserti32x4 ymm15, ymm15, xmmword ptr [rsi+rbx*1-30h], 1h
+@@:
vpunpckldq ymm24, ymm8, ymm9
vpunpckhdq ymm9, ymm8, ymm9
vpunpckldq ymm8, ymm10, ymm11
@@ -520,30 +581,39 @@ innerloop8:
vshufps ymm12, ymm10, ymm12, 0EEh
vshufps ymm10, ymm13, ymm15, 44h
vshufps ymm15, ymm13, ymm15, 0EEh
- mov rbx, qword ptr [rcx]
- mov rdi, qword ptr [rcx+20h]
- vmovups xmm16, xmmword ptr [rbx+rsi*1-20h]
- vinserti32x4 ymm16, ymm16, xmmword ptr [rdi+rsi*1-20h], 1h
- vmovups xmm20, xmmword ptr [rbx+rsi*1-10h]
- vinserti32x4 ymm20, ymm20, xmmword ptr [rdi+rsi*1-10h], 1h
- mov rbx, qword ptr [rcx+8h]
- mov rdi, qword ptr [rcx+28h]
- vmovups xmm17, xmmword ptr [rbx+rsi*1-20h]
- vinserti32x4 ymm17, ymm17, xmmword ptr [rdi+rsi*1-20h], 1h
- vmovups xmm21, xmmword ptr [rbx+rsi*1-10h]
- vinserti32x4 ymm21, ymm21, xmmword ptr [rdi+rsi*1-10h], 1h
- mov rbx, qword ptr [rcx+10h]
- mov rdi, qword ptr [rcx+30h]
- vmovups xmm18, xmmword ptr [rbx+rsi*1-20h]
- vinserti32x4 ymm18, ymm18, xmmword ptr [rdi+rsi*1-20h], 1h
- vmovups xmm22, xmmword ptr [rbx+rsi*1-10h]
- vinserti32x4 ymm22, ymm22, xmmword ptr [rdi+rsi*1-10h], 1h
- mov rbx, qword ptr [rcx+18h]
- mov rdi, qword ptr [rcx+38h]
- vmovups xmm19, xmmword ptr [rbx+rsi*1-20h]
- vinserti32x4 ymm19, ymm19, xmmword ptr [rdi+rsi*1-20h], 1h
- vmovups xmm23, xmmword ptr [rbx+rsi*1-10h]
- vinserti32x4 ymm23, ymm23, xmmword ptr [rdi+rsi*1-10h], 1h
+ mov rax, qword ptr [rcx]
+ mov rsi, qword ptr [rcx+20h]
+ vmovups xmm16, xmmword ptr [rax+rbx*1-20h]
+ vinserti32x4 ymm16, ymm16, xmmword ptr [rsi+rbx*1-20h], 1h
+ vmovups xmm20, xmmword ptr [rax+rbx*1-10h]
+ vinserti32x4 ymm20, ymm20, xmmword ptr [rsi+rbx*1-10h], 1h
+ mov rax, qword ptr [rcx+8h]
+ vmovups xmm17, xmmword ptr [rax+rbx*1-20h]
+ vmovups xmm21, xmmword ptr [rax+rbx*1-10h]
+ cmp dl, 6h
+ jb @F
+ mov rsi, qword ptr [rcx+28h]
+ vinserti32x4 ymm17, ymm17, xmmword ptr [rsi+rbx*1-20h], 1h
+ vinserti32x4 ymm21, ymm21, xmmword ptr [rsi+rbx*1-10h], 1h
+@@:
+ mov rax, qword ptr [rcx+10h]
+ vmovups xmm18, xmmword ptr [rax+rbx*1-20h]
+ vmovups xmm22, xmmword ptr [rax+rbx*1-10h]
+ cmp dl, 7h
+ jb @F
+ mov rsi, qword ptr [rcx+30h]
+ vinserti32x4 ymm18, ymm18, xmmword ptr [rsi+rbx*1-20h], 1h
+ vinserti32x4 ymm22, ymm22, xmmword ptr [rsi+rbx*1-10h], 1h
+@@:
+ mov rax, qword ptr [rcx+18h]
+ vmovups xmm19, xmmword ptr [rax+rbx*1-20h]
+ vmovups xmm23, xmmword ptr [rax+rbx*1-10h]
+ cmp dl, 8h
+ jb @F
+ mov rsi, qword ptr [rcx+38h]
+ vinserti32x4 ymm19, ymm19, xmmword ptr [rsi+rbx*1-20h], 1h
+ vinserti32x4 ymm23, ymm23, xmmword ptr [rsi+rbx*1-10h], 1h
+@@:
vpunpckldq ymm13, ymm16, ymm17
vpunpckhdq ymm17, ymm16, ymm17
vpunpckldq ymm16, ymm18, ymm19
@@ -564,11 +634,11 @@ innerloop8:
vpbroadcastd ymm25, dword ptr [BLAKE3_IV_1]
vpbroadcastd ymm26, dword ptr [BLAKE3_IV_2]
vpbroadcastd ymm27, dword ptr [BLAKE3_IV_3]
- vmovdqa32 ymm28, ymmword ptr [rax]
- vmovdqa32 ymm29, ymmword ptr [rax+40h]
+ vmovdqa32 ymm28, ymmword ptr [rsp]
+ vmovdqa32 ymm29, ymmword ptr [rsp+40h]
vpbroadcastd ymm30, dword ptr [BLAKE3_BLOCK_LEN]
vpbroadcastd ymm31, dword ptr [rsp+80h]
- mov bl, 7h
+ mov al, 7h
@@:
vpaddd ymm0, ymm0, ymm14
vpaddd ymm1, ymm1, ymm24
@@ -700,7 +770,7 @@ innerloop8:
vprord ymm7, ymm7, 7h
vprord ymm4, ymm4, 7h
vmovdqa32 ymm8, ymmword ptr [rsp+0C0h]
- dec bl
+ dec al
jnz @B
vpxord ymm0, ymm0, ymm21
vpxord ymm1, ymm1, ymm25
@@ -710,79 +780,85 @@ innerloop8:
vpxord ymm5, ymm5, ymm29
vpxord ymm6, ymm6, ymm30
vpxord ymm7, ymm7, ymm31
- movzx ebx, byte ptr [rbp+78h]
+ movzx eax, byte ptr [rbp+78h]
+ cmp rbx, qword ptr [rsp+100h]
jb innerloop8
- mov rdi, qword ptr [rbp+90h]
- vunpcklps ymm8, ymm0, ymm1
- vunpcklps ymm9, ymm2, ymm3
- vunpckhps ymm10, ymm0, ymm1
- vunpcklps ymm11, ymm4, ymm5
- vunpcklps ymm0, ymm6, ymm7
+ mov rsi, qword ptr [rbp+90h]
+ vpunpckldq ymm8, ymm0, ymm1
+ vpunpckldq ymm9, ymm2, ymm3
+ vpunpckhdq ymm10, ymm0, ymm1
+ vpunpckldq ymm11, ymm4, ymm5
+ vpunpckldq ymm0, ymm6, ymm7
vshufps ymm12, ymm8, ymm9, 4Eh
- vblendps ymm1, ymm8, ymm12, 0CCh
+ vpblendd ymm1, ymm8, ymm12, 0CCh
vshufps ymm8, ymm11, ymm0, 4Eh
- vunpckhps ymm13, ymm2, ymm3
- vblendps ymm2, ymm11, ymm8, 0CCh
- vblendps ymm3, ymm12, ymm9, 0CCh
- vperm2f128 ymm12, ymm1, ymm2, 20h
- vmovups ymmword ptr [rdi], ymm12
- vunpckhps ymm14, ymm4, ymm5
- vblendps ymm4, ymm8, ymm0, 0CCh
- vunpckhps ymm15, ymm6, ymm7
- vperm2f128 ymm7, ymm3, ymm4, 20h
- vmovups ymmword ptr [rdi+20h], ymm7
+ vpunpckhdq ymm13, ymm2, ymm3
+ vpblendd ymm2, ymm11, ymm8, 0CCh
+ vpblendd ymm3, ymm12, ymm9, 0CCh
+ vperm2i128 ymm12, ymm1, ymm2, 20h
+ vmovdqu ymmword ptr [rsi], ymm12
+ vpunpckhdq ymm14, ymm4, ymm5
+ vpblendd ymm4, ymm8, ymm0, 0CCh
+ vpunpckhdq ymm15, ymm6, ymm7
+ vperm2i128 ymm7, ymm3, ymm4, 20h
+ vmovdqu ymmword ptr [rsi+20h], ymm7
vshufps ymm5, ymm10, ymm13, 4Eh
- vblendps ymm6, ymm5, ymm13, 0CCh
+ vpblendd ymm6, ymm5, ymm13, 0CCh
vshufps ymm13, ymm14, ymm15, 4Eh
- vblendps ymm10, ymm10, ymm5, 0CCh
- vblendps ymm14, ymm14, ymm13, 0CCh
- vperm2f128 ymm8, ymm10, ymm14, 20h
- vmovups ymmword ptr [rdi+40h], ymm8
- vblendps ymm15, ymm13, ymm15, 0CCh
- vperm2f128 ymm13, ymm6, ymm15, 20h
- vmovups ymmword ptr [rdi+60h], ymm13
- vperm2f128 ymm9, ymm1, ymm2, 31h
- vperm2f128 ymm11, ymm3, ymm4, 31h
- vmovups ymmword ptr [rdi+80h], ymm9
- vperm2f128 ymm14, ymm10, ymm14, 31h
- vperm2f128 ymm15, ymm6, ymm15, 31h
- vmovups ymmword ptr [rdi+0A0h], ymm11
- vmovups ymmword ptr [rdi+0C0h], ymm14
- vmovups ymmword ptr [rdi+0E0h], ymm15
- lea r8, qword ptr [rax+20h]
- kortestw k1, k1
- cmovnz rax, r8
- add rdi, 100h
- mov qword ptr [rbp+90h], rdi
- add rcx, 40h
-final7blocks:
+ vpblendd ymm10, ymm10, ymm5, 0CCh
+ vpblendd ymm14, ymm14, ymm13, 0CCh
+ vperm2i128 ymm8, ymm10, ymm14, 20h
+ vmovdqu ymmword ptr [rsi+40h], ymm8
+ vpblendd ymm15, ymm13, ymm15, 0CCh
+ vperm2i128 ymm13, ymm6, ymm15, 20h
+ vmovdqu ymmword ptr [rsi+60h], ymm13
+ vperm2i128 ymm9, ymm1, ymm2, 31h
+ vmovdqu ymmword ptr [rsi+80h], ymm9
+ cmp dl, 6h
+ jb @F
+ vperm2i128 ymm11, ymm3, ymm4, 31h
+ vmovdqu ymmword ptr [rsi+0A0h], ymm11
+ cmp dl, 7h
+ jb @F
+ vperm2i128 ymm14, ymm10, ymm14, 31h
+ vmovdqu ymmword ptr [rsi+0C0h], ymm14
+ cmp dl, 8h
+ jb @F
+ vperm2i128 ymm15, ymm6, ymm15, 31h
+ vmovdqu ymmword ptr [rsi+0E0h], ymm15
+@@:
+ jmp unwind
+final4blocks:
+ mov rax, qword ptr [rsp+100h]
mov rbx, qword ptr [rbp+90h]
movzx esi, byte ptr [rbp+78h]
movzx edi, byte ptr [rbp+88h]
- test dl, 4h
- jz final3blocks
+ mov r8d, 0AAAAh
+ kmovw k2, r8d
+ mov r8d, 8888h
+ kmovw k3, r8d
+ cmp dl, 2h
+ jbe final2blocks
vbroadcasti32x4 zmm0, xmmword ptr [r9]
vbroadcasti32x4 zmm1, xmmword ptr [r9+10h]
vbroadcasti32x4 zmm4, xmmword ptr [BLAKE3_IV]
mov r8d, 4444h
- kmovw k2, r8d
- vmovdqa xmm6, xmmword ptr [rax]
- vmovdqa xmm7, xmmword ptr [rax+40h]
+ kmovw k4, r8d
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+40h]
+ vpbroadcastd zmm5, dword ptr [BLAKE3_BLOCK_LEN]
vpunpckldq xmm8, xmm6, xmm7
- vpunpckhdq xmm9, xmm6, xmm7
- vpermq ymm8, ymm8, 0DCh
- vpermq ymm9, ymm9, 0DCh
- vpbroadcastd zmm6, dword ptr [BLAKE3_BLOCK_LEN]
- vinserti64x4 zmm5, zmm8, ymm9, 1h
- vpblendmd zmm5 {k2}, zmm5, zmm6
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti64x4 zmm8, zmm8, ymm7, 1h
+ vpermq zmm8, zmm8, 0DCh
+ vpblendmd zmm5 {k4}, zmm8, zmm5
mov r8, qword ptr [rcx]
mov r10, qword ptr [rcx+8h]
mov r11, qword ptr [rcx+10h]
+ cmp dl, 4h
+ jb @F
mov r12, qword ptr [rcx+18h]
- mov r13d, 0AAAAh
- kmovw k2, r13d
- mov r13d, 8888h
- kmovw k3, r13d
+@@:
movzx r13d, byte ptr [rbp+80h]
or r13d, esi
xor r14d, r14d
@@ -790,32 +866,34 @@ innerloop4:
movzx r15d, byte ptr [rbp+88h]
or r15d, r13d
add r14, 40h
- cmp r14, qword ptr [rsp+100h]
+ cmp r14, rax
cmovz r13d, r15d
mov dword ptr [rsp+80h], r13d
vmovdqa32 zmm2, zmm4
- vpbroadcastd zmm6, dword ptr [rsp+80h]
- vpblendmd zmm3 {k3}, zmm5, zmm6
+ vpblendmd zmm3 {k3}, zmm5, dword bcst [rsp+80h]
vmovdqu32 zmm10, zmmword ptr [r8+r14*1-40h]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-40h], 1h
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-40h], 2h
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-40h], 3h
vmovdqu32 zmm11, zmmword ptr [r8+r14*1-30h]
+ vmovdqu32 zmm12, zmmword ptr [r8+r14*1-20h]
+ vmovdqu32 zmm13, zmmword ptr [r8+r14*1-10h]
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-40h], 1h
vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-30h], 1h
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r10+r14*1-20h], 1h
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r10+r14*1-10h], 1h
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-40h], 2h
vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-30h], 2h
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r11+r14*1-20h], 2h
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r11+r14*1-10h], 2h
+ cmp dl, 4h
+ jb @F
+ vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-40h], 3h
vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-30h], 3h
+ vinserti32x4 zmm12, zmm12, xmmword ptr [r12+r14*1-20h], 3h
+ vinserti32x4 zmm13, zmm13, xmmword ptr [r12+r14*1-10h], 3h
+@@:
vshufps zmm6, zmm10, zmm11, 88h
vshufps zmm7, zmm10, zmm11, 0DDh
- vmovdqu32 zmm10, zmmword ptr [r8+r14*1-20h]
- vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-20h], 1h
- vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-20h], 2h
- vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-20h], 3h
- vmovdqu32 zmm11, zmmword ptr [r8+r14*1-10h]
- vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-10h], 1h
- vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-10h], 2h
- vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-10h], 3h
- vshufps zmm8, zmm10, zmm11, 88h
- vshufps zmm9, zmm10, zmm11, 0DDh
+ vshufps zmm8, zmm12, zmm13, 88h
+ vshufps zmm9, zmm12, zmm13, 0DDh
vpshufd zmm8, zmm8, 93h
vpshufd zmm9, zmm9, 93h
mov r15b, 7h
@@ -856,24 +934,25 @@ innerloop4:
vpshufd zmm2, zmm2, 93h
dec r15b
jz @F
- vshufps zmm12, zmm6, zmm7, 0D6h
- vpshufd zmm13, zmm6, 0Fh
- vpshufd zmm6, zmm12, 39h
- vshufps zmm12, zmm8, zmm9, 0FAh
- vpblendmd zmm13 {k2}, zmm13, zmm12
- vpunpcklqdq zmm12, zmm9, zmm7
- vpblendmd zmm12 {k3}, zmm12, zmm8
- vpshufd zmm12, zmm12, 78h
+ vshufps zmm14, zmm6, zmm7, 0D6h
+ vpshufd zmm15, zmm6, 0Fh
+ vpshufd zmm6, zmm14, 39h
+ vshufps zmm14, zmm8, zmm9, 0FAh
+ vpblendmd zmm15 {k2}, zmm15, zmm14
+ vpunpcklqdq zmm14, zmm9, zmm7
+ vpblendmd zmm14 {k3}, zmm14, zmm8
+ vpshufd zmm14, zmm14, 78h
vpunpckhdq zmm7, zmm7, zmm9
vpunpckldq zmm8, zmm8, zmm7
vpshufd zmm9, zmm8, 1Eh
- vmovdqa32 zmm7, zmm13
- vmovdqa32 zmm8, zmm12
+ vmovdqa32 zmm7, zmm15
+ vmovdqa32 zmm8, zmm14
jmp @B
@@:
vpxord zmm0, zmm0, zmm2
vpxord zmm1, zmm1, zmm3
mov r13d, esi
+ cmp r14, rax
jb innerloop4
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+10h], xmm1
@@ -881,28 +960,33 @@ innerloop4:
vextracti128 xmmword ptr [rbx+30h], ymm1, 1h
vextracti32x4 xmmword ptr [rbx+40h], zmm0, 2h
vextracti32x4 xmmword ptr [rbx+50h], zmm1, 2h
+ cmp dl, 4h
+ jb @F
vextracti32x4 xmmword ptr [rbx+60h], zmm0, 3h
vextracti32x4 xmmword ptr [rbx+70h], zmm1, 3h
- lea r15, qword ptr [rax+10h]
- kortestw k1, k1
- cmovnz rax, r15
- add rbx, 80h
- add rcx, 20h
-final3blocks:
- test dl, 2h
- jz final1block
+@@:
+ jmp unwind
+final2blocks:
+ test dl, dl
+ jz unwind
vbroadcasti128 ymm0, xmmword ptr [r9]
vbroadcasti128 ymm1, xmmword ptr [r9+10h]
vbroadcasti128 ymm4, xmmword ptr [BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+40h], 1h
- vpinsrd xmm5, xmm5, dword ptr [BLAKE3_BLOCK_LEN], 2h
- vmovd xmm6, dword ptr [rax+4h]
- vpinsrd xmm6, xmm6, dword ptr [rax+44h], 1h
- vpinsrd xmm6, xmm6, dword ptr [BLAKE3_BLOCK_LEN], 2h
- vinserti128 ymm5, ymm5, xmm6, 1h
+ vmovdqa xmm6, xmmword ptr [rsp]
+ vmovdqa xmm7, xmmword ptr [rsp+40h]
+ mov r8d, 40h
+ vpbroadcastq ymm5, r8
+ mov r8d, 55h
+ kmovw k4, r8d
+ vpunpckldq xmm8, xmm6, xmm7
+ vpunpckhdq xmm7, xmm6, xmm7
+ vinserti128 ymm8, ymm8, xmm7, 1h
+ vpermq ymm5 {k4}, ymm8, 0DCh
mov r8, qword ptr [rcx]
+ cmp dl, 2h
+ jb @F
mov r10, qword ptr [rcx+8h]
+@@:
mov r11d, esi
movzx r12d, byte ptr [rbp+80h]
or r11d, r12d
@@ -911,24 +995,26 @@ innerloop2:
movzx r13d, byte ptr [rbp+88h]
or r13d, r11d
add r12, 40h
- cmp r12, qword ptr [rsp+100h]
+ cmp r12, rax
cmovz r11d, r13d
mov dword ptr [rsp+80h], r11d
vmovdqa ymm2, ymm4
- vpbroadcastd ymm6, dword ptr [rsp+80h]
- vpblendd ymm3, ymm5, ymm6, 88h
+ vpblendmd ymm3 {k3}, ymm5, dword bcst [rsp+80h]
vmovdqu ymm10, ymmword ptr [r8+r12*1-40h]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-40h], 1h
vmovdqu ymm11, ymmword ptr [r8+r12*1-30h]
+ vmovdqu ymm12, ymmword ptr [r8+r12*1-20h]
+ vmovdqu ymm13, ymmword ptr [r8+r12*1-10h]
+ cmp dl, 2h
+ jb @F
+ vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-40h], 1h
vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-30h], 1h
+ vinserti128 ymm12, ymm12, xmmword ptr [r10+r12*1-20h], 1h
+ vinserti128 ymm13, ymm13, xmmword ptr [r10+r12*1-10h], 1h
+@@:
vshufps ymm6, ymm10, ymm11, 88h
vshufps ymm7, ymm10, ymm11, 0DDh
- vmovdqu ymm10, ymmword ptr [r8+r12*1-20h]
- vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-20h], 1h
- vmovdqu ymm11, ymmword ptr [r8+r12*1-10h]
- vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-10h], 1h
- vshufps ymm8, ymm10, ymm11, 88h
- vshufps ymm9, ymm10, ymm11, 0DDh
+ vshufps ymm8, ymm12, ymm13, 88h
+ vshufps ymm9, ymm12, ymm13, 0DDh
vpshufd ymm8, ymm8, 93h
vpshufd ymm9, ymm9, 93h
mov r13b, 7h
@@ -987,107 +1073,15 @@ innerloop2:
vpxor ymm0, ymm0, ymm2
vpxor ymm1, ymm1, ymm3
mov r11d, esi
+ cmp r12, rax
jb innerloop2
vmovdqu xmmword ptr [rbx], xmm0
vmovdqu xmmword ptr [rbx+10h], xmm1
+ cmp dl, 2h
+ jb @F
vextracti128 xmmword ptr [rbx+20h], ymm0, 1h
vextracti128 xmmword ptr [rbx+30h], ymm1, 1h
- lea r13, qword ptr [rax+8h]
- kortestw k1, k1
- cmovnz rax, r13
- add rbx, 40h
- add rcx, 10h
-final1block:
- test dl, 1h
- jz unwind
- vmovdqu xmm0, xmmword ptr [r9]
- vmovdqu xmm1, xmmword ptr [r9+10h]
- vmovdqa xmm4, xmmword ptr [BLAKE3_IV]
- vmovd xmm5, dword ptr [rax]
- vpinsrd xmm5, xmm5, dword ptr [rax+40h], 1h
- vpinsrd xmm5, xmm5, dword ptr [BLAKE3_BLOCK_LEN], 2h
- mov r8, qword ptr [rcx]
- mov r10d, esi
- movzx r11d, byte ptr [rbp+80h]
- or r10d, r11d
- xor r11d, r11d
-innerloop1:
- movzx r12d, byte ptr [rbp+88h]
- or r12d, r10d
- add r11, 40h
- cmp r11, qword ptr [rsp+100h]
- cmovz r10d, r12d
- vmovdqa xmm2, xmm4
- vpinsrd xmm3, xmm5, r10d, 3h
- vmovdqu xmm10, xmmword ptr [r8+r11*1-40h]
- vmovdqu xmm11, xmmword ptr [r8+r11*1-30h]
- vshufps xmm6, xmm10, xmm11, 88h
- vshufps xmm7, xmm10, xmm11, 0DDh
- vmovdqu xmm10, xmmword ptr [r8+r11*1-20h]
- vmovdqu xmm11, xmmword ptr [r8+r11*1-10h]
- vshufps xmm8, xmm10, xmm11, 88h
- vshufps xmm9, xmm10, xmm11, 0DDh
- vpshufd xmm8, xmm8, 93h
- vpshufd xmm9, xmm9, 93h
- mov r12b, 7h
-@@:
- vpaddd xmm0, xmm0, xmm6
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 10h
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0Ch
- vpaddd xmm0, xmm0, xmm7
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8h
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7h
- vpshufd xmm0, xmm0, 93h
- vpshufd xmm3, xmm3, 4Eh
- vpshufd xmm2, xmm2, 39h
- vpaddd xmm0, xmm0, xmm8
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 10h
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 0Ch
- vpaddd xmm0, xmm0, xmm9
- vpaddd xmm0, xmm0, xmm1
- vpxord xmm3, xmm3, xmm0
- vprord xmm3, xmm3, 8h
- vpaddd xmm2, xmm2, xmm3
- vpxord xmm1, xmm1, xmm2
- vprord xmm1, xmm1, 7h
- vpshufd xmm0, xmm0, 39h
- vpshufd xmm3, xmm3, 4Eh
- vpshufd xmm2, xmm2, 93h
- dec r12b
- jz @F
- vshufps xmm10, xmm6, xmm7, 0D6h
- vpshufd xmm11, xmm6, 0Fh
- vpshufd xmm6, xmm10, 39h
- vshufps xmm10, xmm8, xmm9, 0FAh
- vpblendd xmm11, xmm11, xmm10, 0AAh
- vpunpcklqdq xmm10, xmm9, xmm7
- vpblendd xmm10, xmm10, xmm8, 88h
- vpshufd xmm10, xmm10, 78h
- vpunpckhdq xmm7, xmm7, xmm9
- vpunpckldq xmm8, xmm8, xmm7
- vpshufd xmm9, xmm8, 1Eh
- vmovdqa xmm7, xmm11
- vmovdqa xmm8, xmm10
- jmp @B
@@:
- vpxor xmm0, xmm0, xmm2
- vpxor xmm1, xmm1, xmm3
- mov r10d, esi
- jb innerloop1
- vmovdqu xmmword ptr [rbx], xmm0
- vmovdqu xmmword ptr [rbx+10h], xmm1
jmp unwind
_blake3_hash_many_avx512 ENDP
blake3_hash_many_avx512 ENDP
@@ -1293,10 +1287,10 @@ _blake3_xof_many_avx512 PROC
cmp rax, 1h
jnbe slowpath
sub rsp, 48h
- movdqa xmmword ptr [rsp], xmm6
- movdqa xmmword ptr [rsp+10h], xmm7
- movdqa xmmword ptr [rsp+20h], xmm8
- movdqa xmmword ptr [rsp+30h], xmm9
+ movaps xmmword ptr [rsp], xmm6
+ movaps xmmword ptr [rsp+10h], xmm7
+ movaps xmmword ptr [rsp+20h], xmm8
+ movaps xmmword ptr [rsp+30h], xmm9
vmovdqu xmm0, xmmword ptr [rcx]
vmovdqu xmm1, xmmword ptr [rcx+10h]
movzx r8d, r8b
@@ -1380,26 +1374,26 @@ _blake3_xof_many_avx512 PROC
vmovdqu xmmword ptr [r8+20h], xmm2
vmovdqu xmmword ptr [r8+30h], xmm3
vzeroupper
- movdqa xmm6, xmmword ptr [rsp]
- movdqa xmm7, xmmword ptr [rsp+10h]
- movdqa xmm8, xmmword ptr [rsp+20h]
- movdqa xmm9, xmmword ptr [rsp+30h]
+ movaps xmm6, xmmword ptr [rsp]
+ movaps xmm7, xmmword ptr [rsp+10h]
+ movaps xmm8, xmmword ptr [rsp+20h]
+ movaps xmm9, xmmword ptr [rsp+30h]
add rsp, 48h
ret
slowpath:
push rbp
mov rbp, rsp
sub rsp, 1A0h
- movdqa xmmword ptr [rbp-0A0h], xmm6
- movdqa xmmword ptr [rbp-90h], xmm7
- movdqa xmmword ptr [rbp-80h], xmm8
- movdqa xmmword ptr [rbp-70h], xmm9
- movdqa xmmword ptr [rbp-60h], xmm10
- movdqa xmmword ptr [rbp-50h], xmm11
- movdqa xmmword ptr [rbp-40h], xmm12
- movdqa xmmword ptr [rbp-30h], xmm13
- movdqa xmmword ptr [rbp-20h], xmm14
- movdqa xmmword ptr [rbp-10h], xmm15
+ movaps xmmword ptr [rbp-0A0h], xmm6
+ movaps xmmword ptr [rbp-90h], xmm7
+ movaps xmmword ptr [rbp-80h], xmm8
+ movaps xmmword ptr [rbp-70h], xmm9
+ movaps xmmword ptr [rbp-60h], xmm10
+ movaps xmmword ptr [rbp-50h], xmm11
+ movaps xmmword ptr [rbp-40h], xmm12
+ movaps xmmword ptr [rbp-30h], xmm13
+ movaps xmmword ptr [rbp-20h], xmm14
+ movaps xmmword ptr [rbp-10h], xmm15
and rsp, -40h
vpbroadcastd zmm0, r9d
shr r9, 20h
@@ -1711,23 +1705,23 @@ innerloop16:
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+40h], zmm1
add r9, 400h
- cmp rax, 18h
- lea rax, qword ptr [rax-10h]
+ sub rax, 10h
+ cmp rax, 8h
jnbe innerloop16
test al, al
jnz final8blocks
unwind:
vzeroupper
- movdqa xmm6, xmmword ptr [rbp-0A0h]
- movdqa xmm7, xmmword ptr [rbp-90h]
- movdqa xmm8, xmmword ptr [rbp-80h]
- movdqa xmm9, xmmword ptr [rbp-70h]
- movdqa xmm10, xmmword ptr [rbp-60h]
- movdqa xmm11, xmmword ptr [rbp-50h]
- movdqa xmm12, xmmword ptr [rbp-40h]
- movdqa xmm13, xmmword ptr [rbp-30h]
- movdqa xmm14, xmmword ptr [rbp-20h]
- movdqa xmm15, xmmword ptr [rbp-10h]
+ movaps xmm6, xmmword ptr [rbp-0A0h]
+ movaps xmm7, xmmword ptr [rbp-90h]
+ movaps xmm8, xmmword ptr [rbp-80h]
+ movaps xmm9, xmmword ptr [rbp-70h]
+ movaps xmm10, xmmword ptr [rbp-60h]
+ movaps xmm11, xmmword ptr [rbp-50h]
+ movaps xmm12, xmmword ptr [rbp-40h]
+ movaps xmm13, xmmword ptr [rbp-30h]
+ movaps xmm14, xmmword ptr [rbp-20h]
+ movaps xmm15, xmmword ptr [rbp-10h]
mov rsp, rbp
pop rbp
ret