diff options
Diffstat (limited to 'c/blake3_avx512_x86-64_unix.S')
| -rw-r--r-- | c/blake3_avx512_x86-64_unix.S | 588 |
1 files changed, 291 insertions, 297 deletions
diff --git a/c/blake3_avx512_x86-64_unix.S b/c/blake3_avx512_x86-64_unix.S index 9b82424..868e9f8 100644 --- a/c/blake3_avx512_x86-64_unix.S +++ b/c/blake3_avx512_x86-64_unix.S @@ -46,7 +46,7 @@ blake3_hash_many_avx512: vpbroadcastd ymm0, r8d shr r8, 0x20 vpbroadcastd ymm1, r8d - vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0+ 0] + vmovdqa32 ymm2 {k1} {z}, ymmword ptr [rip+ADD0+0] vmovdqa32 ymm3 {k1} {z}, ymmword ptr [rip+ADD0+32] vpaddd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp], ymm2 @@ -61,8 +61,8 @@ blake3_hash_many_avx512: vmovdqa ymmword ptr [rsp+0x60], ymm1 shl rdx, 0x06 mov qword ptr [rsp+0x100], rdx - cmp rsi, 0x10 - jb 5f + cmp rsi, 0x08 + jbe 5f .p2align 5 2: vpbroadcastd zmm0, dword ptr [rcx] @@ -89,39 +89,60 @@ blake3_hash_many_avx512: mov r8, qword ptr [rdi+0x10] mov r9, qword ptr [rdi+0x18] mov r10, qword ptr [rdi+0x40] - mov r11, qword ptr [rdi+0x48] - mov r12, qword ptr [rdi+0x50] - mov r13, qword ptr [rdi+0x58] vmovdqu32 ymm8, ymmword ptr [rax+rdx*1-0x40] vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rdx*1-0x40], 0x01 vmovdqu32 ymm9, ymmword ptr [rbx+rdx*1-0x40] + cmp rsi, 0x0A + jb 4f + mov r11, qword ptr [rdi+0x48] vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rdx*1-0x40], 0x01 +4: vpunpckldq zmm10, zmm8, zmm9 vpunpckhdq zmm11, zmm8, zmm9 vmovdqu32 ymm8, ymmword ptr [r8+rdx*1-0x40] + cmp rsi, 0x0B + jb 4f + mov r12, qword ptr [rdi+0x50] vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rdx*1-0x40], 0x01 +4: vmovdqu32 ymm9, ymmword ptr [r9+rdx*1-0x40] + cmp rsi, 0x0C + jb 4f + mov r13, qword ptr [rdi+0x58] vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rdx*1-0x40], 0x01 +4: vpunpckldq zmm12, zmm8, zmm9 vpunpckhdq zmm13, zmm8, zmm9 mov rax, qword ptr [rdi+0x20] mov rbx, qword ptr [rdi+0x28] mov r8, qword ptr [rdi+0x30] mov r9, qword ptr [rdi+0x38] - mov r10, qword ptr [rdi+0x60] - mov r11, qword ptr [rdi+0x68] - mov r12, qword ptr [rdi+0x70] - mov r13, qword ptr [rdi+0x78] vmovdqu32 ymm8, ymmword ptr [rax+rdx*1-0x40] + cmp rsi, 0x0D + jb 4f + mov r10, qword ptr [rdi+0x60] vinserti64x4 zmm8, zmm8, ymmword ptr [r10+rdx*1-0x40], 0x01 +4: vmovdqu32 ymm9, ymmword ptr [rbx+rdx*1-0x40] + cmp rsi, 0x0E + jb 4f + mov r11, qword ptr [rdi+0x68] vinserti64x4 zmm9, zmm9, ymmword ptr [r11+rdx*1-0x40], 0x01 +4: vpunpckldq zmm14, zmm8, zmm9 vpunpckhdq zmm15, zmm8, zmm9 vmovdqu32 ymm8, ymmword ptr [r8+rdx*1-0x40] + cmp rsi, 0x0F + jb 4f + mov r12, qword ptr [rdi+0x70] vinserti64x4 zmm8, zmm8, ymmword ptr [r12+rdx*1-0x40], 0x01 +4: vmovdqu32 ymm9, ymmword ptr [r9+rdx*1-0x40] + cmp rsi, 0x10 + jb 4f + mov r13, qword ptr [rdi+0x78] vinserti64x4 zmm9, zmm9, ymmword ptr [r13+rdx*1-0x40], 0x01 +4: vpunpckldq zmm16, zmm8, zmm9 vpunpckhdq zmm17, zmm8, zmm9 vmovdqa32 zmm8, zmmword ptr [rip+INDEX0] @@ -151,19 +172,31 @@ blake3_hash_many_avx512: mov r8, qword ptr [rdi+0x10] mov r9, qword ptr [rdi+0x18] mov r10, qword ptr [rdi+0x40] - mov r11, qword ptr [rdi+0x48] - mov r12, qword ptr [rdi+0x50] - mov r13, qword ptr [rdi+0x58] vmovdqu32 ymm11, ymmword ptr [rax+rdx*1-0x20] vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rdx*1-0x20], 0x01 vmovdqu32 ymm13, ymmword ptr [rbx+rdx*1-0x20] + cmp rsi, 0x0A + jb 4f + mov r11, qword ptr [rdi+0x48] vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r11+rdx*1+0x80] +4: vpunpckldq zmm15, zmm11, zmm13 vpunpckhdq zmm17, zmm11, zmm13 vmovdqu32 ymm11, ymmword ptr [r8+rdx*1-0x20] + cmp rsi, 0x0B + jb 4f + mov r12, qword ptr [rdi+0x50] vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r13+rdx*1+0x80] +4: vmovdqu32 ymm13, ymmword ptr [r9+rdx*1-0x20] + cmp rsi, 0x0C + jb 4f + mov r13, qword ptr [rdi+0x58] vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r13+rdx*1+0x80] +4: vpunpckldq zmm22, zmm11, zmm13 vpunpckhdq zmm23, zmm11, zmm13 prefetcht0 byte ptr [rax+rdx*1+0x80] @@ -171,33 +204,42 @@ blake3_hash_many_avx512: prefetcht0 byte ptr [r8+rdx*1+0x80] prefetcht0 byte ptr [r9+rdx*1+0x80] prefetcht0 byte ptr [r10+rdx*1+0x80] - prefetcht0 byte ptr [r11+rdx*1+0x80] - prefetcht0 byte ptr [r12+rdx*1+0x80] - prefetcht0 byte ptr [r13+rdx*1+0x80] mov rax, qword ptr [rdi+0x20] mov rbx, qword ptr [rdi+0x28] mov r8, qword ptr [rdi+0x30] mov r9, qword ptr [rdi+0x38] - mov r10, qword ptr [rdi+0x60] - mov r11, qword ptr [rdi+0x68] - mov r12, qword ptr [rdi+0x70] - mov r13, qword ptr [rdi+0x78] vmovdqu32 ymm11, ymmword ptr [rax+rdx*1-0x20] + cmp rsi, 0x0D + jb 4f + mov r10, qword ptr [rdi+0x60] vinserti64x4 zmm11, zmm11, ymmword ptr [r10+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r10+rdx*1+0x80] +4: vmovdqu32 ymm13, ymmword ptr [rbx+rdx*1-0x20] + cmp rsi, 0x0E + jb 4f + mov r11, qword ptr [rdi+0x68] vinserti64x4 zmm13, zmm13, ymmword ptr [r11+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r11+rdx*1+0x80] +4: vpunpckldq zmm24, zmm11, zmm13 vpunpckhdq zmm25, zmm11, zmm13 vmovdqu32 ymm11, ymmword ptr [r8+rdx*1-0x20] + cmp rsi, 0x0F + jb 4f + mov r12, qword ptr [rdi+0x70] vinserti64x4 zmm11, zmm11, ymmword ptr [r12+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r12+rdx*1+0x80] +4: vmovdqu32 ymm13, ymmword ptr [r9+rdx*1-0x20] + cmp rsi, 0x10 + jb 4f + mov r13, qword ptr [rdi+0x78] vinserti64x4 zmm13, zmm13, ymmword ptr [r13+rdx*1-0x20], 0x01 + prefetcht0 byte ptr [r13+rdx*1+0x80] +4: vpunpckldq zmm26, zmm11, zmm13 vpunpckhdq zmm27, zmm11, zmm13 - prefetcht0 byte ptr [rax+rdx*1+0x80] - prefetcht0 byte ptr [rbx+rdx*1+0x80] - prefetcht0 byte ptr [r8+rdx*1+0x80] - prefetcht0 byte ptr [r9+rdx*1+0x80] prefetcht0 byte ptr [r10+rdx*1+0x80] prefetcht0 byte ptr [r11+rdx*1+0x80] prefetcht0 byte ptr [r12+rdx*1+0x80] @@ -372,6 +414,7 @@ blake3_hash_many_avx512: vpxord zmm6, zmm6, zmm30 vpxord zmm7, zmm7, zmm31 movzx eax, byte ptr [rbp+0x38] + cmp rdx, qword ptr [rsp+0x100] jb 3b mov rbx, qword ptr [rbp+0x50] vpunpckldq zmm8, zmm0, zmm2 @@ -413,12 +456,26 @@ blake3_hash_many_avx512: vextracti64x4 ymmword ptr [rbx+0xC0], zmm2, 0x00 vextracti64x4 ymmword ptr [rbx+0xE0], zmm3, 0x00 vextracti64x4 ymmword ptr [rbx+0x100], zmm8, 0x01 + cmp rsi, 0x0A + jb 9f vextracti64x4 ymmword ptr [rbx+0x120], zmm10, 0x01 + cmp rsi, 0x0B + jb 9f vextracti64x4 ymmword ptr [rbx+0x140], zmm12, 0x01 + cmp rsi, 0x0C + jb 9f vextracti64x4 ymmword ptr [rbx+0x160], zmm14, 0x01 + cmp rsi, 0x0D + jb 9f vextracti64x4 ymmword ptr [rbx+0x180], zmm0, 0x01 + cmp rsi, 0x0E + jb 9f vextracti64x4 ymmword ptr [rbx+0x1A0], zmm1, 0x01 + cmp rsi, 0x0F + jb 9f vextracti64x4 ymmword ptr [rbx+0x1C0], zmm2, 0x01 + cmp rsi, 0x10 + jb 9f vextracti64x4 ymmword ptr [rbx+0x1E0], zmm3, 0x01 vmovdqa32 zmm8, zmmword ptr [rsp] vmovdqa32 zmm9, zmmword ptr [rsp+0x40] @@ -432,8 +489,8 @@ blake3_hash_many_avx512: mov qword ptr [rbp+0x50], rbx add rdi, 0x80 sub rsi, 0x10 - cmp rsi, 0x10 - jnb 2b + cmp rsi, 0x08 + jnbe 2b test esi, esi jnz 5f 9: @@ -448,9 +505,8 @@ blake3_hash_many_avx512: ret .p2align 6 5: - mov rax, rsp - test sil, 0x08 - jz 3f + cmp sil, 0x04 + jbe 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x04] vpbroadcastd ymm2, dword ptr [rcx+0x08] @@ -459,45 +515,50 @@ blake3_hash_many_avx512: vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] - movzx edx, byte ptr [rbp+0x38] - movzx ebx, byte ptr [rbp+0x40] - or edx, ebx - xor ebx, ebx + movzx eax, byte ptr [rbp+0x38] + movzx edx, byte ptr [rbp+0x40] + or eax, edx + xor edx, edx 2: - movzx r8d, byte ptr [rbp+0x48] - or r8d, edx - add rbx, 0x40 - cmp rbx, qword ptr [rsp+0x100] - cmovz edx, r8d - mov dword ptr [rsp+0x80], edx - mov edx, 0xCC - kmovw k2, edx - mov edx, 0x33 - kmovw k3, edx - mov rdx, qword ptr [rdi] - mov r8, qword ptr [rdi+0x20] - vmovups xmm8, xmmword ptr [rdx+rbx*1-0x40] - vinserti32x4 ymm8, ymm8, xmmword ptr [r8+rbx*1-0x40], 0x01 - vmovups xmm12, xmmword ptr [rdx+rbx*1-0x30] - vinserti32x4 ymm12, ymm12, xmmword ptr [r8+rbx*1-0x30], 0x01 - mov rdx, qword ptr [rdi+0x08] - mov r8, qword ptr [rdi+0x28] - vmovups xmm9, xmmword ptr [rdx+rbx*1-0x40] - vinserti32x4 ymm9, ymm9, xmmword ptr [r8+rbx*1-0x40], 0x01 - vmovups xmm13, xmmword ptr [rdx+rbx*1-0x30] - vinserti32x4 ymm13, ymm13, xmmword ptr [r8+rbx*1-0x30], 0x01 - mov rdx, qword ptr [rdi+0x10] - mov r8, qword ptr [rdi+0x30] - vmovups xmm10, xmmword ptr [rdx+rbx*1-0x40] - vinserti32x4 ymm10, ymm10, xmmword ptr [r8+rbx*1-0x40], 0x01 - vmovups xmm14, xmmword ptr [rdx+rbx*1-0x30] - vinserti32x4 ymm14, ymm14, xmmword ptr [r8+rbx*1-0x30], 0x01 - mov rdx, qword ptr [rdi+0x18] - mov r8, qword ptr [rdi+0x38] - vmovups xmm11, xmmword ptr [rdx+rbx*1-0x40] - vinserti32x4 ymm11, ymm11, xmmword ptr [r8+rbx*1-0x40], 0x01 - vmovups xmm15, xmmword ptr [rdx+rbx*1-0x30] - vinserti32x4 ymm15, ymm15, xmmword ptr [r8+rbx*1-0x30], 0x01 + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 0x40 + cmp rdx, qword ptr [rsp+0x100] + cmovz eax, ebx + mov dword ptr [rsp+0x80], eax + mov rax, qword ptr [rdi] + mov rbx, qword ptr [rdi+0x20] + vmovups xmm8, xmmword ptr [rax+rdx*1-0x40] + vinserti32x4 ymm8, ymm8, xmmword ptr [rbx+rdx*1-0x40], 0x01 + vmovups xmm12, xmmword ptr [rax+rdx*1-0x30] + vinserti32x4 ymm12, ymm12, xmmword ptr [rbx+rdx*1-0x30], 0x01 + mov rax, qword ptr [rdi+0x08] + vmovups xmm9, xmmword ptr [rax+rdx*1-0x40] + vmovups xmm13, xmmword ptr [rax+rdx*1-0x30] + cmp sil, 0x06 + jb 4f + mov rbx, qword ptr [rdi+0x28] + vinserti32x4 ymm9, ymm9, xmmword ptr [rbx+rdx*1-0x40], 0x01 + vinserti32x4 ymm13, ymm13, xmmword ptr [rbx+rdx*1-0x30], 0x01 +4: + mov rax, qword ptr [rdi+0x10] + vmovups xmm10, xmmword ptr [rax+rdx*1-0x40] + vmovups xmm14, xmmword ptr [rax+rdx*1-0x30] + cmp sil, 0x07 + jb 4f + mov rbx, qword ptr [rdi+0x30] + vinserti32x4 ymm10, ymm10, xmmword ptr [rbx+rdx*1-0x40], 0x01 + vinserti32x4 ymm14, ymm14, xmmword ptr [rbx+rdx*1-0x30], 0x01 +4: + mov rax, qword ptr [rdi+0x18] + vmovups xmm11, xmmword ptr [rax+rdx*1-0x40] + vmovups xmm15, xmmword ptr [rax+rdx*1-0x30] + cmp sil, 0x08 + jb 4f + mov rbx, qword ptr [rdi+0x38] + vinserti32x4 ymm11, ymm11, xmmword ptr [rbx+rdx*1-0x40], 0x01 + vinserti32x4 ymm15, ymm15, xmmword ptr [rbx+rdx*1-0x30], 0x01 +4: vpunpckldq ymm24, ymm8, ymm9 vpunpckhdq ymm9, ymm8, ymm9 vpunpckldq ymm8, ymm10, ymm11 @@ -514,30 +575,39 @@ blake3_hash_many_avx512: vshufps ymm12, ymm10, ymm12, 0xEE vshufps ymm10, ymm13, ymm15, 0x44 vshufps ymm15, ymm13, ymm15, 0xEE - mov rdx, qword ptr [rdi] - mov r8, qword ptr [rdi+0x20] - vmovups xmm16, xmmword ptr [rdx+rbx*1-0x20] - vinserti32x4 ymm16, ymm16, xmmword ptr [r8+rbx*1-0x20], 0x01 - vmovups xmm20, xmmword ptr [rdx+rbx*1-0x10] - vinserti32x4 ymm20, ymm20, xmmword ptr [r8+rbx*1-0x10], 0x01 - mov rdx, qword ptr [rdi+0x08] - mov r8, qword ptr [rdi+0x28] - vmovups xmm17, xmmword ptr [rdx+rbx*1-0x20] - vinserti32x4 ymm17, ymm17, xmmword ptr [r8+rbx*1-0x20], 0x01 - vmovups xmm21, xmmword ptr [rdx+rbx*1-0x10] - vinserti32x4 ymm21, ymm21, xmmword ptr [r8+rbx*1-0x10], 0x01 - mov rdx, qword ptr [rdi+0x10] - mov r8, qword ptr [rdi+0x30] - vmovups xmm18, xmmword ptr [rdx+rbx*1-0x20] - vinserti32x4 ymm18, ymm18, xmmword ptr [r8+rbx*1-0x20], 0x01 - vmovups xmm22, xmmword ptr [rdx+rbx*1-0x10] - vinserti32x4 ymm22, ymm22, xmmword ptr [r8+rbx*1-0x10], 0x01 - mov rdx, qword ptr [rdi+0x18] - mov r8, qword ptr [rdi+0x38] - vmovups xmm19, xmmword ptr [rdx+rbx*1-0x20] - vinserti32x4 ymm19, ymm19, xmmword ptr [r8+rbx*1-0x20], 0x01 - vmovups xmm23, xmmword ptr [rdx+rbx*1-0x10] - vinserti32x4 ymm23, ymm23, xmmword ptr [r8+rbx*1-0x10], 0x01 + mov rax, qword ptr [rdi] + mov rbx, qword ptr [rdi+0x20] + vmovups xmm16, xmmword ptr [rax+rdx*1-0x20] + vinserti32x4 ymm16, ymm16, xmmword ptr [rbx+rdx*1-0x20], 0x01 + vmovups xmm20, xmmword ptr [rax+rdx*1-0x10] + vinserti32x4 ymm20, ymm20, xmmword ptr [rbx+rdx*1-0x10], 0x01 + mov rax, qword ptr [rdi+0x08] + vmovups xmm17, xmmword ptr [rax+rdx*1-0x20] + vmovups xmm21, xmmword ptr [rax+rdx*1-0x10] + cmp sil, 0x06 + jb 4f + mov rbx, qword ptr [rdi+0x28] + vinserti32x4 ymm17, ymm17, xmmword ptr [rbx+rdx*1-0x20], 0x01 + vinserti32x4 ymm21, ymm21, xmmword ptr [rbx+rdx*1-0x10], 0x01 +4: + mov rax, qword ptr [rdi+0x10] + vmovups xmm18, xmmword ptr [rax+rdx*1-0x20] + vmovups xmm22, xmmword ptr [rax+rdx*1-0x10] + cmp sil, 0x07 + jb 4f + mov rbx, qword ptr [rdi+0x30] + vinserti32x4 ymm18, ymm18, xmmword ptr [rbx+rdx*1-0x20], 0x01 + vinserti32x4 ymm22, ymm22, xmmword ptr [rbx+rdx*1-0x10], 0x01 +4: + mov rax, qword ptr [rdi+0x18] + vmovups xmm19, xmmword ptr [rax+rdx*1-0x20] + vmovups xmm23, xmmword ptr [rax+rdx*1-0x10] + cmp sil, 0x08 + jb 4f + mov rbx, qword ptr [rdi+0x38] + vinserti32x4 ymm19, ymm19, xmmword ptr [rbx+rdx*1-0x20], 0x01 + vinserti32x4 ymm23, ymm23, xmmword ptr [rbx+rdx*1-0x10], 0x01 +4: vpunpckldq ymm13, ymm16, ymm17 vpunpckhdq ymm17, ymm16, ymm17 vpunpckldq ymm16, ymm18, ymm19 @@ -558,11 +628,11 @@ blake3_hash_many_avx512: vpbroadcastd ymm25, dword ptr [rip+BLAKE3_IV_1] vpbroadcastd ymm26, dword ptr [rip+BLAKE3_IV_2] vpbroadcastd ymm27, dword ptr [rip+BLAKE3_IV_3] - vmovdqa32 ymm28, ymmword ptr [rax] - vmovdqa32 ymm29, ymmword ptr [rax+0x40] + vmovdqa32 ymm28, ymmword ptr [rsp] + vmovdqa32 ymm29, ymmword ptr [rsp+0x40] vpbroadcastd ymm30, dword ptr [rip+BLAKE3_BLOCK_LEN] vpbroadcastd ymm31, dword ptr [rsp+0x80] - mov dl, 0x07 + mov al, 0x07 4: vpaddd ymm0, ymm0, ymm14 vpaddd ymm1, ymm1, ymm24 @@ -694,7 +764,7 @@ blake3_hash_many_avx512: vprord ymm7, ymm7, 0x07 vprord ymm4, ymm4, 0x07 vmovdqa32 ymm8, ymmword ptr [rsp+0xC0] - dec dl + dec al jnz 4b vpxord ymm0, ymm0, ymm21 vpxord ymm1, ymm1, ymm25 @@ -704,79 +774,85 @@ blake3_hash_many_avx512: vpxord ymm5, ymm5, ymm29 vpxord ymm6, ymm6, ymm30 vpxord ymm7, ymm7, ymm31 - movzx edx, byte ptr [rbp+0x38] + movzx eax, byte ptr [rbp+0x38] + cmp rdx, qword ptr [rsp+0x100] jb 2b - mov r8, qword ptr [rbp+0x50] - vunpcklps ymm8, ymm0, ymm1 - vunpcklps ymm9, ymm2, ymm3 - vunpckhps ymm10, ymm0, ymm1 - vunpcklps ymm11, ymm4, ymm5 - vunpcklps ymm0, ymm6, ymm7 + mov rbx, qword ptr [rbp+0x50] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckldq ymm9, ymm2, ymm3 + vpunpckhdq ymm10, ymm0, ymm1 + vpunpckldq ymm11, ymm4, ymm5 + vpunpckldq ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 0x4E - vblendps ymm1, ymm8, ymm12, 0xCC + vpblendd ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 0x4E - vunpckhps ymm13, ymm2, ymm3 - vblendps ymm2, ymm11, ymm8, 0xCC - vblendps ymm3, ymm12, ymm9, 0xCC - vperm2f128 ymm12, ymm1, ymm2, 0x20 - vmovups ymmword ptr [r8], ymm12 - vunpckhps ymm14, ymm4, ymm5 - vblendps ymm4, ymm8, ymm0, 0xCC - vunpckhps ymm15, ymm6, ymm7 - vperm2f128 ymm7, ymm3, ymm4, 0x20 - vmovups ymmword ptr [r8+0x20], ymm7 + vpunpckhdq ymm13, ymm2, ymm3 + vpblendd ymm2, ymm11, ymm8, 0xCC + vpblendd ymm3, ymm12, ymm9, 0xCC + vperm2i128 ymm12, ymm1, ymm2, 0x20 + vmovdqu ymmword ptr [rbx], ymm12 + vpunpckhdq ymm14, ymm4, ymm5 + vpblendd ymm4, ymm8, ymm0, 0xCC + vpunpckhdq ymm15, ymm6, ymm7 + vperm2i128 ymm7, ymm3, ymm4, 0x20 + vmovdqu ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 0x4E - vblendps ymm6, ymm5, ymm13, 0xCC + vpblendd ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 0x4E - vblendps ymm10, ymm10, ymm5, 0xCC - vblendps ymm14, ymm14, ymm13, 0xCC - vperm2f128 ymm8, ymm10, ymm14, 0x20 - vmovups ymmword ptr [r8+0x40], ymm8 - vblendps ymm15, ymm13, ymm15, 0xCC - vperm2f128 ymm13, ymm6, ymm15, 0x20 - vmovups ymmword ptr [r8+0x60], ymm13 - vperm2f128 ymm9, ymm1, ymm2, 0x31 - vperm2f128 ymm11, ymm3, ymm4, 0x31 - vmovups ymmword ptr [r8+0x80], ymm9 - vperm2f128 ymm14, ymm10, ymm14, 0x31 - vperm2f128 ymm15, ymm6, ymm15, 0x31 - vmovups ymmword ptr [r8+0xA0], ymm11 - vmovups ymmword ptr [r8+0xC0], ymm14 - vmovups ymmword ptr [r8+0xE0], ymm15 - lea r9, qword ptr [rax+0x20] - kortestw k1, k1 - cmovnz rax, r9 - add r8, 0x100 - mov qword ptr [rbp+0x50], r8 - add rdi, 0x40 + vpblendd ymm10, ymm10, ymm5, 0xCC + vpblendd ymm14, ymm14, ymm13, 0xCC + vperm2i128 ymm8, ymm10, ymm14, 0x20 + vmovdqu ymmword ptr [rbx+0x40], ymm8 + vpblendd ymm15, ymm13, ymm15, 0xCC + vperm2i128 ymm13, ymm6, ymm15, 0x20 + vmovdqu ymmword ptr [rbx+0x60], ymm13 + vperm2i128 ymm9, ymm1, ymm2, 0x31 + vmovdqu ymmword ptr [rbx+0x80], ymm9 + cmp sil, 0x06 + jb 4f + vperm2i128 ymm11, ymm3, ymm4, 0x31 + vmovdqu ymmword ptr [rbx+0xA0], ymm11 + cmp sil, 0x07 + jb 4f + vperm2i128 ymm14, ymm10, ymm14, 0x31 + vmovdqu ymmword ptr [rbx+0xC0], ymm14 + cmp sil, 0x08 + jb 4f + vperm2i128 ymm15, ymm6, ymm15, 0x31 + vmovdqu ymmword ptr [rbx+0xE0], ymm15 +4: + jmp 9b 3: + mov rax, qword ptr [rsp+0x100] mov rdx, qword ptr [rbp+0x50] movzx ebx, byte ptr [rbp+0x38] movzx r8d, byte ptr [rbp+0x48] - test sil, 0x04 - jz 3f + mov r9d, 0xAAAA + kmovw k2, r9d + mov r9d, 0x8888 + kmovw k3, r9d + cmp sil, 0x02 + jbe 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x10] vbroadcasti32x4 zmm4, xmmword ptr [rip+BLAKE3_IV] mov r9d, 0x4444 - kmovw k2, r9d - vmovdqa xmm6, xmmword ptr [rax] - vmovdqa xmm7, xmmword ptr [rax+0x40] + kmovw k4, r9d + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x40] + vpbroadcastd zmm5, dword ptr [rip+BLAKE3_BLOCK_LEN] vpunpckldq xmm8, xmm6, xmm7 - vpunpckhdq xmm9, xmm6, xmm7 - vpermq ymm8, ymm8, 0xDC - vpermq ymm9, ymm9, 0xDC - vpbroadcastd zmm6, dword ptr [rip+BLAKE3_BLOCK_LEN] - vinserti64x4 zmm5, zmm8, ymm9, 0x01 - vpblendmd zmm5 {k2}, zmm5, zmm6 + vpunpckhdq xmm7, xmm6, xmm7 + vinserti64x4 zmm8, zmm8, ymm7, 0x01 + vpermq zmm8, zmm8, 0xDC + vpblendmd zmm5 {k4}, zmm8, zmm5 mov r9, qword ptr [rdi] mov r10, qword ptr [rdi+0x08] mov r11, qword ptr [rdi+0x10] + cmp sil, 0x04 + jb 4f mov r12, qword ptr [rdi+0x18] - mov r13d, 0xAAAA - kmovw k2, r13d - mov r13d, 0x8888 - kmovw k3, r13d +4: movzx r13d, byte ptr [rbp+0x40] or r13d, ebx xor r14d, r14d @@ -784,32 +860,34 @@ blake3_hash_many_avx512: movzx r15d, byte ptr [rbp+0x48] or r15d, r13d add r14, 0x40 - cmp r14, qword ptr [rsp+0x100] + cmp r14, rax cmovz r13d, r15d mov dword ptr [rsp+0x80], r13d vmovdqa32 zmm2, zmm4 - vpbroadcastd zmm6, dword ptr [rsp+0x80] - vpblendmd zmm3 {k3}, zmm5, zmm6 + vpblendmd zmm3 {k3}, zmm5, dword ptr [rsp+0x80] {1to16} vmovdqu32 zmm10, zmmword ptr [r9+r14*1-0x40] - vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01 - vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02 - vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03 vmovdqu32 zmm11, zmmword ptr [r9+r14*1-0x30] + vmovdqu32 zmm12, zmmword ptr [r9+r14*1-0x20] + vmovdqu32 zmm13, zmmword ptr [r9+r14*1-0x10] + vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x40], 0x01 vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x30], 0x01 + vinserti32x4 zmm12, zmm12, xmmword ptr [r10+r14*1-0x20], 0x01 + vinserti32x4 zmm13, zmm13, xmmword ptr [r10+r14*1-0x10], 0x01 + vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x40], 0x02 vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x30], 0x02 + vinserti32x4 zmm12, zmm12, xmmword ptr [r11+r14*1-0x20], 0x02 + vinserti32x4 zmm13, zmm13, xmmword ptr [r11+r14*1-0x10], 0x02 + cmp sil, 0x04 + jb 4f + vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x40], 0x03 vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x30], 0x03 + vinserti32x4 zmm12, zmm12, xmmword ptr [r12+r14*1-0x20], 0x03 + vinserti32x4 zmm13, zmm13, xmmword ptr [r12+r14*1-0x10], 0x03 +4: vshufps zmm6, zmm10, zmm11, 0x88 vshufps zmm7, zmm10, zmm11, 0xDD - vmovdqu32 zmm10, zmmword ptr [r9+r14*1-0x20] - vinserti32x4 zmm10, zmm10, xmmword ptr [r10+r14*1-0x20], 0x01 - vinserti32x4 zmm10, zmm10, xmmword ptr [r11+r14*1-0x20], 0x02 - vinserti32x4 zmm10, zmm10, xmmword ptr [r12+r14*1-0x20], 0x03 - vmovdqu32 zmm11, zmmword ptr [r9+r14*1-0x10] - vinserti32x4 zmm11, zmm11, xmmword ptr [r10+r14*1-0x10], 0x01 - vinserti32x4 zmm11, zmm11, xmmword ptr [r11+r14*1-0x10], 0x02 - vinserti32x4 zmm11, zmm11, xmmword ptr [r12+r14*1-0x10], 0x03 - vshufps zmm8, zmm10, zmm11, 0x88 - vshufps zmm9, zmm10, zmm11, 0xDD + vshufps zmm8, zmm12, zmm13, 0x88 + vshufps zmm9, zmm12, zmm13, 0xDD vpshufd zmm8, zmm8, 0x93 vpshufd zmm9, zmm9, 0x93 mov r15b, 0x07 @@ -850,24 +928,25 @@ blake3_hash_many_avx512: vpshufd zmm2, zmm2, 0x93 dec r15b jz 4f - vshufps zmm12, zmm6, zmm7, 0xD6 - vpshufd zmm13, zmm6, 0x0F - vpshufd zmm6, zmm12, 0x39 - vshufps zmm12, zmm8, zmm9, 0xFA - vpblendmd zmm13 {k2}, zmm13, zmm12 - vpunpcklqdq zmm12, zmm9, zmm7 - vpblendmd zmm12 {k3}, zmm12, zmm8 - vpshufd zmm12, zmm12, 0x78 + vshufps zmm14, zmm6, zmm7, 0xD6 + vpshufd zmm15, zmm6, 0x0F + vpshufd zmm6, zmm14, 0x39 + vshufps zmm14, zmm8, zmm9, 0xFA + vpblendmd zmm15 {k2}, zmm15, zmm14 + vpunpcklqdq zmm14, zmm9, zmm7 + vpblendmd zmm14 {k3}, zmm14, zmm8 + vpshufd zmm14, zmm14, 0x78 vpunpckhdq zmm7, zmm7, zmm9 vpunpckldq zmm8, zmm8, zmm7 vpshufd zmm9, zmm8, 0x1E - vmovdqa32 zmm7, zmm13 - vmovdqa32 zmm8, zmm12 + vmovdqa32 zmm7, zmm15 + vmovdqa32 zmm8, zmm14 jmp 4b 4: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov r13d, ebx + cmp r14, rax jb 2b vmovdqu xmmword ptr [rdx], xmm0 vmovdqu xmmword ptr [rdx+0x10], xmm1 @@ -875,28 +954,33 @@ blake3_hash_many_avx512: vextracti128 xmmword ptr [rdx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rdx+0x40], zmm0, 0x02 vextracti32x4 xmmword ptr [rdx+0x50], zmm1, 0x02 + cmp sil, 0x04 + jb 4f vextracti32x4 xmmword ptr [rdx+0x60], zmm0, 0x03 vextracti32x4 xmmword ptr [rdx+0x70], zmm1, 0x03 - lea r15, qword ptr [rax+0x10] - kortestw k1, k1 - cmovnz rax, r15 - add rdx, 0x80 - add rdi, 0x20 +4: + jmp 9b 3: - test sil, 0x02 - jz 3f + test sil, sil + jz 9b vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vbroadcasti128 ymm4, xmmword ptr [rip+BLAKE3_IV] - vmovd xmm5, dword ptr [rax] - vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01 - vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02 - vmovd xmm6, dword ptr [rax+0x04] - vpinsrd xmm6, xmm6, dword ptr [rax+0x44], 0x01 - vpinsrd xmm6, xmm6, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02 - vinserti128 ymm5, ymm5, xmm6, 0x01 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x40] + mov r9d, 0x40 + vpbroadcastq ymm5, r9 + mov r9d, 0x55 + kmovw k4, r9d + vpunpckldq xmm8, xmm6, xmm7 + vpunpckhdq xmm7, xmm6, xmm7 + vinserti128 ymm8, ymm8, xmm7, 0x01 + vpermq ymm5 {k4}, ymm8, 0xDC mov r9, qword ptr [rdi] + cmp sil, 0x02 + jb 4f mov r10, qword ptr [rdi+0x08] +4: mov r11d, ebx movzx r12d, byte ptr [rbp+0x40] or r11d, r12d @@ -905,24 +989,26 @@ blake3_hash_many_avx512: movzx r13d, byte ptr [rbp+0x48] or r13d, r11d add r12, 0x40 - cmp r12, qword ptr [rsp+0x100] + cmp r12, rax cmovz r11d, r13d mov dword ptr [rsp+0x80], r11d vmovdqa ymm2, ymm4 - vpbroadcastd ymm6, dword ptr [rsp+0x80] - vpblendd ymm3, ymm5, ymm6, 0x88 + vpblendmd ymm3 {k3}, ymm5, dword ptr [rsp+0x80] {1to8} vmovdqu ymm10, ymmword ptr [r9+r12*1-0x40] - vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01 vmovdqu ymm11, ymmword ptr [r9+r12*1-0x30] + vmovdqu ymm12, ymmword ptr [r9+r12*1-0x20] + vmovdqu ymm13, ymmword ptr [r9+r12*1-0x10] + cmp sil, 0x02 + jb 4f + vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x40], 0x01 vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x30], 0x01 + vinserti128 ymm12, ymm12, xmmword ptr [r10+r12*1-0x20], 0x01 + vinserti128 ymm13, ymm13, xmmword ptr [r10+r12*1-0x10], 0x01 +4: vshufps ymm6, ymm10, ymm11, 0x88 vshufps ymm7, ymm10, ymm11, 0xDD - vmovdqu ymm10, ymmword ptr [r9+r12*1-0x20] - vinserti128 ymm10, ymm10, xmmword ptr [r10+r12*1-0x20], 0x01 - vmovdqu ymm11, ymmword ptr [r9+r12*1-0x10] - vinserti128 ymm11, ymm11, xmmword ptr [r10+r12*1-0x10], 0x01 - vshufps ymm8, ymm10, ymm11, 0x88 - vshufps ymm9, ymm10, ymm11, 0xDD + vshufps ymm8, ymm12, ymm13, 0x88 + vshufps ymm9, ymm12, ymm13, 0xDD vpshufd ymm8, ymm8, 0x93 vpshufd ymm9, ymm9, 0x93 mov r13b, 0x07 @@ -981,107 +1067,15 @@ blake3_hash_many_avx512: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov r11d, ebx + cmp r12, rax jb 2b vmovdqu xmmword ptr [rdx], xmm0 vmovdqu xmmword ptr [rdx+0x10], xmm1 + cmp sil, 0x02 + jb 4f vextracti128 xmmword ptr [rdx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rdx+0x30], ymm1, 0x01 - lea r13, qword ptr [rax+0x08] - kortestw k1, k1 - cmovnz rax, r13 - add rdx, 0x40 - add rdi, 0x10 -3: - test sil, 0x01 - jz 9b - vmovdqu xmm0, xmmword ptr [rcx] - vmovdqu xmm1, xmmword ptr [rcx+0x10] - vmovdqa xmm4, xmmword ptr [rip+BLAKE3_IV] - vmovd xmm5, dword ptr [rax] - vpinsrd xmm5, xmm5, dword ptr [rax+0x40], 0x01 - vpinsrd xmm5, xmm5, dword ptr [rip+BLAKE3_BLOCK_LEN], 0x02 - mov r9, qword ptr [rdi] - mov r10d, ebx - movzx r11d, byte ptr [rbp+0x40] - or r10d, r11d - xor r11d, r11d -2: - movzx r12d, byte ptr [rbp+0x48] - or r12d, r10d - add r11, 0x40 - cmp r11, qword ptr [rsp+0x100] - cmovz r10d, r12d - vmovdqa xmm2, xmm4 - vpinsrd xmm3, xmm5, r10d, 0x03 - vmovdqu xmm10, xmmword ptr [r9+r11*1-0x40] - vmovdqu xmm11, xmmword ptr [r9+r11*1-0x30] - vshufps xmm6, xmm10, xmm11, 0x88 - vshufps xmm7, xmm10, xmm11, 0xDD - vmovdqu xmm10, xmmword ptr [r9+r11*1-0x20] - vmovdqu xmm11, xmmword ptr [r9+r11*1-0x10] - vshufps xmm8, xmm10, xmm11, 0x88 - vshufps xmm9, xmm10, xmm11, 0xDD - vpshufd xmm8, xmm8, 0x93 - vpshufd xmm9, xmm9, 0x93 - mov r12b, 0x07 -4: - vpaddd xmm0, xmm0, xmm6 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 0x10 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 0x0C - vpaddd xmm0, xmm0, xmm7 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 0x08 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 0x07 - vpshufd xmm0, xmm0, 0x93 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x39 - vpaddd xmm0, xmm0, xmm8 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 0x10 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 0x0C - vpaddd xmm0, xmm0, xmm9 - vpaddd xmm0, xmm0, xmm1 - vpxord xmm3, xmm3, xmm0 - vprord xmm3, xmm3, 0x08 - vpaddd xmm2, xmm2, xmm3 - vpxord xmm1, xmm1, xmm2 - vprord xmm1, xmm1, 0x07 - vpshufd xmm0, xmm0, 0x39 - vpshufd xmm3, xmm3, 0x4E - vpshufd xmm2, xmm2, 0x93 - dec r12b - jz 4f - vshufps xmm10, xmm6, xmm7, 0xD6 - vpshufd xmm11, xmm6, 0x0F - vpshufd xmm6, xmm10, 0x39 - vshufps xmm10, xmm8, xmm9, 0xFA - vpblendd xmm11, xmm11, xmm10, 0xAA - vpunpcklqdq xmm10, xmm9, xmm7 - vpblendd xmm10, xmm10, xmm8, 0x88 - vpshufd xmm10, xmm10, 0x78 - vpunpckhdq xmm7, xmm7, xmm9 - vpunpckldq xmm8, xmm8, xmm7 - vpshufd xmm9, xmm8, 0x1E - vmovdqa xmm7, xmm11 - vmovdqa xmm8, xmm10 - jmp 4b 4: - vpxor xmm0, xmm0, xmm2 - vpxor xmm1, xmm1, xmm3 - mov r10d, ebx - jb 2b - vmovdqu xmmword ptr [rdx], xmm0 - vmovdqu xmmword ptr [rdx+0x10], xmm1 jmp 9b .p2align 6 @@ -1658,8 +1652,8 @@ _blake3_xof_many_avx512: vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x40], zmm1 add r9, 0x400 - cmp rax, 0x18 - lea rax, qword ptr [rax-0x10] + sub rax, 0x10 + cmp rax, 0x08 jnbe 3b test al, al jnz 2f |
