aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Samuel Neves <[email protected]> 2020-04-12 11:36:10 +0100
committer Samuel Neves <[email protected]> 2020-04-12 11:38:11 +0100
commit 7ef795d62effe2f59e7aff6d4bbba81e398d23e3 (patch)
tree 1d7825715f8ef3803ba0af3cf75280d6fb3fe804
parent 370ba3644ae2b38b91e52033fc7e9ae705920495 (diff)
Do not require AVX512DQ
Whereas vinserti64x4 is part of AVX512F, vinserti32x8 requires AVX512DQ, which we do not test for. At this point there is not much risk of incompatibility, since Skylake-X chips have all the required instruction sets, but let's be precise about this. The dispatch code now also checks for AVX512VL in addition to AVX512F before selecting the AVX-512 implementation.
-rw-r--r-- c/blake3_avx512_x86-64_unix.S 34
-rw-r--r-- c/blake3_avx512_x86-64_windows_gnu.S 34
-rw-r--r-- c/blake3_avx512_x86-64_windows_msvc.asm 34
-rw-r--r-- c/blake3_dispatch.c 4
4 files changed, 53 insertions, 53 deletions
diff --git a/c/blake3_avx512_x86-64_unix.S b/c/blake3_avx512_x86-64_unix.S
index e5b9b05..25612cb 100644
--- a/c/blake3_avx512_x86-64_unix.S
+++ b/c/blake3_avx512_x86-64_unix.S
@@ -82,15 +82,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+0x20]
@@ -102,15 +102,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -144,15 +144,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@@ -172,15 +172,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@@ -2039,7 +2039,7 @@ blake3_hash_many_avx512:
vpermq ymm14, ymm14, 0xDC
vpermq ymm15, ymm15, 0xDC
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
diff --git a/c/blake3_avx512_x86-64_windows_gnu.S b/c/blake3_avx512_x86-64_windows_gnu.S
index 6ce5629..e10b9f3 100644
--- a/c/blake3_avx512_x86-64_windows_gnu.S
+++ b/c/blake3_avx512_x86-64_windows_gnu.S
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+0x20]
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 [r8+rdx+0x80]
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
vpermq ymm14, ymm14, 0xDC
vpermq ymm15, ymm15, 0xDC
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
diff --git a/c/blake3_avx512_x86-64_windows_msvc.asm b/c/blake3_avx512_x86-64_windows_msvc.asm
index 33fdb85..b19efba 100644
--- a/c/blake3_avx512_x86-64_windows_msvc.asm
+++ b/c/blake3_avx512_x86-64_windows_msvc.asm
@@ -99,15 +99,15 @@ innerloop16:
mov r14, qword ptr [rdi+50H]
mov r15, qword ptr [rdi+58H]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vpunpcklqdq zmm8, zmm16, zmm17
vpunpckhqdq zmm9, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
mov r8, qword ptr [rdi+20H]
@@ -119,15 +119,15 @@ innerloop16:
mov r14, qword ptr [rdi+70H]
mov r15, qword ptr [rdi+78H]
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
vpunpcklqdq zmm12, zmm16, zmm17
vpunpckhqdq zmm13, zmm16, zmm17
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
vpunpcklqdq zmm14, zmm18, zmm19
vpunpckhqdq zmm15, zmm18, zmm19
vmovdqa32 zmm27, zmmword ptr [INDEX0]
@@ -161,15 +161,15 @@ innerloop16:
mov r14, qword ptr [rdi+50H]
mov r15, qword ptr [rdi+58H]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vpunpcklqdq zmm8, zmm24, zmm25
vpunpckhqdq zmm9, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vpunpcklqdq zmm10, zmm24, zmm25
vpunpckhqdq zmm11, zmm24, zmm25
prefetcht0 byte ptr [r8+rdx+80H]
@@ -189,15 +189,15 @@ innerloop16:
mov r14, qword ptr [rdi+70H]
mov r15, qword ptr [rdi+78H]
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
vpunpcklqdq zmm12, zmm24, zmm25
vpunpckhqdq zmm13, zmm24, zmm25
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
vpunpcklqdq zmm14, zmm24, zmm25
vpunpckhqdq zmm15, zmm24, zmm25
prefetcht0 byte ptr [r8+rdx+80H]
@@ -2073,7 +2073,7 @@ final7blocks:
vpermq ymm14, ymm14, 0DCH
vpermq ymm15, ymm15, 0DCH
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
- vinserti32x8 zmm13, zmm14, ymm15, 01H
+ vinserti64x4 zmm13, zmm14, ymm15, 01H
mov eax, 17476
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
index add8bef..6847725 100644
--- a/c/blake3_dispatch.c
+++ b/c/blake3_dispatch.c
@@ -182,7 +182,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512F) {
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
@@ -223,7 +223,7 @@ size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
#if !defined(BLAKE3_NO_AVX512)
- if (features & AVX512F) {
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif