diff options
| author | Samuel Neves <[email protected]> | 2020-04-12 11:36:10 +0100 |
|---|---|---|
| committer | Samuel Neves <[email protected]> | 2020-04-12 11:38:11 +0100 |
| commit | 7ef795d62effe2f59e7aff6d4bbba81e398d23e3 (patch) | |
| tree | 1d7825715f8ef3803ba0af3cf75280d6fb3fe804 | |
| parent | 370ba3644ae2b38b91e52033fc7e9ae705920495 (diff) | |
Do not require AVX512DQ
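Whereas vinserti64x4 is part of AVX512F, vinserti32x8 requires
AVX512DQ, which we do not test for. At this point there is not
much risk of incompatibility, since Skylake-X chips have all the
required instruction sets, but let's be precise about this.
For the same reason, the dispatcher now checks for AVX512VL in addition
to AVX512F, since the code uses 256-bit EVEX-encoded instructions
(e.g. vmovdqu32 on ymm16 and above).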
| -rw-r--r-- | c/blake3_avx512_x86-64_unix.S | 34 | ||||
| -rw-r--r-- | c/blake3_avx512_x86-64_windows_gnu.S | 34 | ||||
| -rw-r--r-- | c/blake3_avx512_x86-64_windows_msvc.asm | 34 | ||||
| -rw-r--r-- | c/blake3_dispatch.c | 4 |
4 files changed, 53 insertions, 53 deletions
diff --git a/c/blake3_avx512_x86-64_unix.S b/c/blake3_avx512_x86-64_unix.S index e5b9b05..25612cb 100644 --- a/c/blake3_avx512_x86-64_unix.S +++ b/c/blake3_avx512_x86-64_unix.S @@ -82,15 +82,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] @@ -102,15 +102,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vinserti64x4 zmm19, 
zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] @@ -144,15 +144,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] @@ -172,15 +172,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr 
[r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] @@ -2039,7 +2039,7 @@ blake3_hash_many_avx512: vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti32x8 zmm13, zmm14, ymm15, 0x01 + vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 diff --git a/c/blake3_avx512_x86-64_windows_gnu.S b/c/blake3_avx512_x86-64_windows_gnu.S index 6ce5629..e10b9f3 100644 --- a/c/blake3_avx512_x86-64_windows_gnu.S +++ b/c/blake3_avx512_x86-64_windows_gnu.S @@ -96,15 +96,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] @@ -116,15 +116,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + 
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] @@ -158,15 +158,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] @@ -186,15 +186,15 @@ blake3_hash_many_avx512: mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, 
ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] @@ -2065,7 +2065,7 @@ blake3_hash_many_avx512: vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] - vinserti32x8 zmm13, zmm14, ymm15, 0x01 + vinserti64x4 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 diff --git a/c/blake3_avx512_x86-64_windows_msvc.asm b/c/blake3_avx512_x86-64_windows_msvc.asm index 33fdb85..b19efba 100644 --- a/c/blake3_avx512_x86-64_windows_msvc.asm +++ b/c/blake3_avx512_x86-64_windows_msvc.asm @@ -99,15 +99,15 @@ innerloop16: mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+20H] @@ 
-119,15 +119,15 @@ innerloop16: mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] - vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] - vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] - vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] - vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0] @@ -161,15 +161,15 @@ innerloop16: mov r14, qword ptr [rdi+50H] mov r15, qword ptr [rdi+58H] vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] @@ -189,15 +189,15 @@ innerloop16: mov r14, qword ptr [rdi+70H] mov r15, qword ptr [rdi+78H] vmovdqu32 ymm24, ymmword 
ptr [r8+rdx-1H*20H] - vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] - vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] - vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] - vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 byte ptr [r8+rdx+80H] @@ -2073,7 +2073,7 @@ final7blocks: vpermq ymm14, ymm14, 0DCH vpermq ymm15, ymm15, 0DCH vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] - vinserti32x8 zmm13, zmm14, ymm15, 01H + vinserti64x4 zmm13, zmm14, ymm15, 01H mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c index add8bef..6847725 100644 --- a/c/blake3_dispatch.c +++ b/c/blake3_dispatch.c @@ -182,7 +182,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) - if (features & AVX512F) { + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); @@ -223,7 +223,7 @@ size_t blake3_simd_degree(void) { #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); #if !defined(BLAKE3_NO_AVX512) - if (features & AVX512F) { + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { return 16; } #endif |
