aboutsummaryrefslogtreecommitdiff
path: root/src/c
diff options
context:
space:
mode:
Diffstat (limited to 'src/c')
-rw-r--r--src/c/blake3.h3
-rw-r--r--src/c/blake3_avx512.c283
-rw-r--r--src/c/blake3_impl.h79
-rw-r--r--src/c/blake3_neon.c5
4 files changed, 172 insertions, 198 deletions
diff --git a/src/c/blake3.h b/src/c/blake3.h
index 5c68521..c3cf6be 100644
--- a/src/c/blake3.h
+++ b/src/c/blake3.h
@@ -28,8 +28,7 @@ typedef struct {
void blake3_hasher_init(blake3_hasher *self);
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
-void blake3_hasher_init_derive_key(blake3_hasher *self,
- const uint8_t key[BLAKE3_KEY_LEN]);
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c
index 2c8657c..fc754e2 100644
--- a/src/c/blake3_avx512.c
+++ b/src/c/blake3_avx512.c
@@ -2,6 +2,10 @@
#include <immintrin.h>
+#define _mm_shuffle_ps2(a, b, c) \
+ (_mm_castps_si128( \
+ _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
+
INLINE __m128i loadu_128(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
@@ -123,159 +127,160 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
__m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
- __m128i buf, t0, t1, t2;
+ __m128i t0, t1, t2, t3, tt;
- // round 1
- buf = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0)));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- buf = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1)));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ // Round 1. The first round permutes the message words from the original
+ // input order, into the groups that get mixed in parallel.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1));
- t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2));
- buf = _mm_blend_epi16(t0, t1, 0xC3);
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_blend_epi16(t0, t1, 0x3C);
- buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
+ t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
+ t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 2
- t0 = _mm_blend_epi16(m1, m2, 0x0C);
- t1 = _mm_slli_si128(m3, 4);
- t2 = _mm_blend_epi16(t0, t1, 0xF0);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0));
- t1 = _mm_blend_epi16(m1, m3, 0xC0);
- t2 = _mm_blend_epi16(t0, t1, 0xF0);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 2. This round and all following rounds apply a fixed permutation
+ // to the message words from the round before.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_slli_si128(m1, 4);
- t1 = _mm_blend_epi16(m2, t0, 0x30);
- t2 = _mm_blend_epi16(m0, t1, 0xF0);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_unpackhi_epi32(m0, m1);
- t1 = _mm_slli_si128(m3, 4);
- t2 = _mm_blend_epi16(t0, t1, 0x0C);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 3
- t0 = _mm_unpackhi_epi32(m2, m3);
- t1 = _mm_blend_epi16(m3, m1, 0x0C);
- t2 = _mm_blend_epi16(t0, t1, 0x0F);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_unpacklo_epi32(m2, m0);
- t1 = _mm_blend_epi16(t0, m0, 0xF0);
- t2 = _mm_slli_si128(m3, 8);
- buf = _mm_blend_epi16(t1, t2, 0xC0);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 3
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_blend_epi16(m0, m2, 0x3C);
- t1 = _mm_srli_si128(m1, 12);
- t2 = _mm_blend_epi16(t0, t1, 0x03);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_slli_si128(m3, 4);
- t1 = _mm_blend_epi16(m0, m1, 0x33);
- t2 = _mm_blend_epi16(t1, t0, 0xC0);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 4
- t0 = _mm_unpackhi_epi32(m0, m1);
- t1 = _mm_unpackhi_epi32(t0, m2);
- t2 = _mm_blend_epi16(t1, m3, 0x0C);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_slli_si128(m2, 8);
- t1 = _mm_blend_epi16(m3, m0, 0x0C);
- t2 = _mm_blend_epi16(t1, t0, 0xC0);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 4
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_blend_epi16(m0, m1, 0x0F);
- t1 = _mm_blend_epi16(t0, m3, 0xC0);
- buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_alignr_epi8(m0, m1, 4);
- buf = _mm_blend_epi16(t0, m2, 0x33);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 5
- t0 = _mm_unpacklo_epi64(m1, m2);
- t1 = _mm_unpackhi_epi64(m0, m2);
- t2 = _mm_blend_epi16(t0, t1, 0x33);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_unpackhi_epi64(m1, m3);
- t1 = _mm_unpacklo_epi64(m0, m1);
- buf = _mm_blend_epi16(t0, t1, 0x33);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 5
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_unpackhi_epi64(m3, m1);
- t1 = _mm_unpackhi_epi64(m2, m0);
- t2 = _mm_blend_epi16(t1, t0, 0x33);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_blend_epi16(m0, m2, 0x03);
- t1 = _mm_slli_si128(t0, 8);
- t2 = _mm_blend_epi16(t1, m3, 0x0F);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 6
- t0 = _mm_unpackhi_epi32(m0, m1);
- t1 = _mm_unpacklo_epi32(m0, m2);
- buf = _mm_unpacklo_epi64(t0, t1);
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_srli_si128(m2, 4);
- t1 = _mm_blend_epi16(m0, m3, 0x03);
- buf = _mm_blend_epi16(t1, t0, 0x3C);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 6
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_blend_epi16(m1, m0, 0x0C);
- t1 = _mm_srli_si128(m3, 4);
- t2 = _mm_blend_epi16(t0, t1, 0x30);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_unpacklo_epi64(m2, m1);
- t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0));
- t2 = _mm_srli_si128(t0, 4);
- buf = _mm_blend_epi16(t1, t2, 0x33);
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
-
- // round 7
- t0 = _mm_slli_si128(m1, 12);
- t1 = _mm_blend_epi16(m0, m3, 0x33);
- buf = _mm_blend_epi16(t1, t0, 0xC0);
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_blend_epi16(m3, m2, 0x30);
- t1 = _mm_srli_si128(m1, 4);
- t2 = _mm_blend_epi16(t0, t1, 0x03);
- buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ m0 = t0;
+ m1 = t1;
+ m2 = t2;
+ m3 = t3;
+
+ // Round 7
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
+ t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
+ t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
+ tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
+ t1 = _mm_blend_epi16(tt, t1, 0xCC);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
- t0 = _mm_unpacklo_epi64(m0, m2);
- t1 = _mm_srli_si128(m1, 4);
- buf =
- _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2));
- g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
- t0 = _mm_unpackhi_epi32(m1, m2);
- t1 = _mm_unpackhi_epi64(m0, t0);
- buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3));
- g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ t2 = _mm_unpacklo_epi64(m3, m1);
+ tt = _mm_blend_epi16(t2, m2, 0xC0);
+ t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
+ t3 = _mm_unpackhi_epi32(m1, m3);
+ tt = _mm_unpacklo_epi32(m2, t3);
+ t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h
index 9a44391..576ccf4 100644
--- a/src/c/blake3_impl.h
+++ b/src/c/blake3_impl.h
@@ -18,7 +18,8 @@
#define PARENT 4
#define ROOT 8
#define KEYED_HASH 16
-#define DERIVE_KEY 32
+#define DERIVE_KEY_CONTEXT 32
+#define DERIVE_KEY_MATERIAL 64
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
@@ -34,12 +35,12 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
static const uint8_t MSG_SCHEDULE[7][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
- {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
- {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
- {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
- {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
- {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
- {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+ {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+ {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+ {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+ {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+ {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+ {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
// Count the number of 1 bits.
@@ -80,53 +81,17 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
key_words[7] = load32(&key[7 * 4]);
}
-// Declarations for implementation-specific functions.
-void blake3_compress_in_place_portable(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-void blake3_compress_in_place_sse41(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-void blake3_compress_in_place_avx512(uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags);
-void blake3_compress_xof_portable(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-void blake3_compress_xof_sse41(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-void blake3_compress_xof_avx512(const uint32_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t counter,
- uint8_t flags, uint8_t out[64]);
-void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
-void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint32_t key[8],
- uint64_t counter, bool increment_counter,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out);
+void blake3_compress_in_place(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags);
+
+void blake3_compress_xof(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter, uint8_t flags,
+ uint8_t out[64]);
+
+void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
+ size_t blocks, const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c
index 7335c19..46691f5 100644
--- a/src/c/blake3_neon.c
+++ b/src/c/blake3_neon.c
@@ -290,6 +290,11 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
* ----------------------------------------------------------------------------
*/
+void blake3_compress_in_place_portable(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags);
+
INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start, uint8_t flags_end,