diff options
Diffstat (limited to 'src/c')
| -rw-r--r-- | src/c/blake3.h | 3 |
| -rw-r--r-- | src/c/blake3_avx512.c | 283 |
| -rw-r--r-- | src/c/blake3_impl.h | 79 |
| -rw-r--r-- | src/c/blake3_neon.c | 5 |
4 files changed, 172 insertions, 198 deletions
diff --git a/src/c/blake3.h b/src/c/blake3.h index 5c68521..c3cf6be 100644 --- a/src/c/blake3.h +++ b/src/c/blake3.h @@ -28,8 +28,7 @@ typedef struct { void blake3_hasher_init(blake3_hasher *self); void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c index 2c8657c..fc754e2 100644 --- a/src/c/blake3_avx512.c +++ b/src/c/blake3_avx512.c @@ -2,6 +2,10 @@ #include <immintrin.h> +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + INLINE __m128i loadu_128(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } @@ -123,159 +127,160 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - __m128i buf, t0, t1, t2; + __m128i t0, t1, t2, t3, tt; - // round 1 - buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1))); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1)); - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2)); - buf = _mm_blend_epi16(t0, t1, 0xC3); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(t0, t1, 0x3C); - buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 2 - t0 = _mm_blend_epi16(m1, m2, 0x0C); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); - t1 = _mm_blend_epi16(m1, m3, 0xC0); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_slli_si128(m1, 4); - t1 = _mm_blend_epi16(m2, t0, 0x30); - t2 = _mm_blend_epi16(m0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0x0C); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 3 - t0 = _mm_unpackhi_epi32(m2, m3); - t1 = _mm_blend_epi16(m3, m1, 0x0C); - t2 = _mm_blend_epi16(t0, t1, 0x0F); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpacklo_epi32(m2, m0); - t1 = _mm_blend_epi16(t0, m0, 0xF0); - t2 = _mm_slli_si128(m3, 8); - buf = _mm_blend_epi16(t1, t2, 0xC0); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = 
_mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m0, m2, 0x3C); - t1 = _mm_srli_si128(m1, 12); - t2 = _mm_blend_epi16(t0, t1, 0x03); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_slli_si128(m3, 4); - t1 = _mm_blend_epi16(m0, m1, 0x33); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 4 - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_unpackhi_epi32(t0, m2); - t2 = _mm_blend_epi16(t1, m3, 0x0C); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_slli_si128(m2, 8); - t1 = _mm_blend_epi16(m3, m0, 0x0C); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m0, m1, 0x0F); - t1 = _mm_blend_epi16(t0, m3, 0xC0); - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - 
g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_alignr_epi8(m0, m1, 4); - buf = _mm_blend_epi16(t0, m2, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 5 - t0 = _mm_unpacklo_epi64(m1, m2); - t1 = _mm_unpackhi_epi64(m0, m2); - t2 = _mm_blend_epi16(t0, t1, 0x33); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi64(m1, m3); - t1 = _mm_unpacklo_epi64(m0, m1); - buf = _mm_blend_epi16(t0, t1, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_unpackhi_epi64(m3, m1); - t1 = _mm_unpackhi_epi64(m2, m0); - t2 = _mm_blend_epi16(t1, t0, 0x33); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(m0, m2, 0x03); - t1 = _mm_slli_si128(t0, 8); - t2 = _mm_blend_epi16(t1, m3, 0x0F); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 6 - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_unpacklo_epi32(m0, m2); - buf = _mm_unpacklo_epi64(t0, t1); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_srli_si128(m2, 4); - t1 = _mm_blend_epi16(m0, m3, 0x03); - buf = _mm_blend_epi16(t1, t0, 0x3C); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m1, m0, 0x0C); - t1 = _mm_srli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0x30); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpacklo_epi64(m2, m1); - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0)); - t2 = _mm_srli_si128(t0, 4); - buf = _mm_blend_epi16(t1, t2, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 7 - t0 = _mm_slli_si128(m1, 12); - t1 = _mm_blend_epi16(m0, m3, 0x33); - buf = _mm_blend_epi16(t1, t0, 0xC0); - 
g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(m3, m2, 0x30); - t1 = _mm_srli_si128(m1, 4); - t2 = _mm_blend_epi16(t0, t1, 0x03); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_unpacklo_epi64(m0, m2); - t1 = _mm_srli_si128(m1, 4); - buf = - _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi32(m1, m2); - t1 = _mm_unpackhi_epi64(m0, t0); - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h index 9a44391..576ccf4 100644 --- a/src/c/blake3_impl.h +++ b/src/c/blake3_impl.h @@ -18,7 +18,8 @@ #define PARENT 4 #define ROOT 8 #define KEYED_HASH 16 -#define DERIVE_KEY 32 +#define DERIVE_KEY_CONTEXT 32 +#define DERIVE_KEY_MATERIAL 64 // This C implementation tries to support recent versions of GCC, Clang, and // MSVC. 
@@ -34,12 +35,12 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, static const uint8_t MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, }; // Count the number of 1 bits. @@ -80,53 +81,17 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], key_words[7] = load32(&key[7 * 4]); } -// Declarations for implementation-specific functions. 
-void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - 
size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c index 7335c19..46691f5 100644 --- a/src/c/blake3_neon.c +++ b/src/c/blake3_neon.c @@ -290,6 +290,11 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, * ---------------------------------------------------------------------------- */ +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + INLINE void hash_one_neon(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, |
