diff options
| author | Jack O'Connor <[email protected]> | 2020-01-03 12:51:04 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2020-01-04 22:14:21 -0500 |
| commit | a8bff48111eea3212781638e3959a0a7227ddc13 (patch) | |
| tree | b6518057c81242f7cac5ea2f615f9704e1f9db1b /src | |
| parent | 72ba63ca86b3aa1d7052331641767614717c9ac7 (diff) | |
WIP switch to new permutation
Diffstat (limited to 'src')
| -rw-r--r-- | src/c/blake3.h | 3 | ||||
| -rw-r--r-- | src/c/blake3_avx512.c | 283 | ||||
| -rw-r--r-- | src/c/blake3_impl.h | 79 | ||||
| -rw-r--r-- | src/c/blake3_neon.c | 5 | ||||
| -rw-r--r-- | src/lib.rs | 12 | ||||
| -rw-r--r-- | src/sse41.rs | 412 |
6 files changed, 390 insertions, 404 deletions
diff --git a/src/c/blake3.h b/src/c/blake3.h index 5c68521..c3cf6be 100644 --- a/src/c/blake3.h +++ b/src/c/blake3.h @@ -28,8 +28,7 @@ typedef struct { void blake3_hasher_init(blake3_hasher *self); void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); -void blake3_hasher_init_derive_key(blake3_hasher *self, - const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c index 2c8657c..fc754e2 100644 --- a/src/c/blake3_avx512.c +++ b/src/c/blake3_avx512.c @@ -2,6 +2,10 @@ #include <immintrin.h> +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + INLINE __m128i loadu_128(const uint8_t src[16]) { return _mm_loadu_si128((const __m128i *)src); } @@ -123,159 +127,160 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); - __m128i buf, t0, t1, t2; + __m128i t0, t1, t2, t3, tt; - // round 1 - buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1))); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1)); - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2)); - buf = _mm_blend_epi16(t0, t1, 0xC3); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(t0, t1, 0x3C); - buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 2 - t0 = _mm_blend_epi16(m1, m2, 0x0C); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); - t1 = _mm_blend_epi16(m1, m3, 0xC0); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_slli_si128(m1, 4); - t1 = _mm_blend_epi16(m2, t0, 0x30); - t2 = _mm_blend_epi16(m0, t1, 0xF0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0x0C); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 3 - t0 = _mm_unpackhi_epi32(m2, m3); - t1 = _mm_blend_epi16(m3, m1, 0x0C); - t2 = _mm_blend_epi16(t0, t1, 0x0F); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpacklo_epi32(m2, m0); - t1 = _mm_blend_epi16(t0, m0, 0xF0); - t2 = _mm_slli_si128(m3, 8); - buf = _mm_blend_epi16(t1, t2, 0xC0); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = 
_mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m0, m2, 0x3C); - t1 = _mm_srli_si128(m1, 12); - t2 = _mm_blend_epi16(t0, t1, 0x03); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_slli_si128(m3, 4); - t1 = _mm_blend_epi16(m0, m1, 0x33); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 4 - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_unpackhi_epi32(t0, m2); - t2 = _mm_blend_epi16(t1, m3, 0x0C); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_slli_si128(m2, 8); - t1 = _mm_blend_epi16(m3, m0, 0x0C); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m0, m1, 0x0F); - t1 = _mm_blend_epi16(t0, m3, 0xC0); - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - 
g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_alignr_epi8(m0, m1, 4); - buf = _mm_blend_epi16(t0, m2, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 5 - t0 = _mm_unpacklo_epi64(m1, m2); - t1 = _mm_unpackhi_epi64(m0, m2); - t2 = _mm_blend_epi16(t0, t1, 0x33); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi64(m1, m3); - t1 = _mm_unpacklo_epi64(m0, m1); - buf = _mm_blend_epi16(t0, t1, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_unpackhi_epi64(m3, m1); - t1 = _mm_unpackhi_epi64(m2, m0); - t2 = _mm_blend_epi16(t1, t0, 0x33); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(m0, m2, 0x03); - t1 = _mm_slli_si128(t0, 8); - t2 = _mm_blend_epi16(t1, m3, 0x0F); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 6 - t0 = _mm_unpackhi_epi32(m0, m1); - t1 = _mm_unpacklo_epi32(m0, m2); - buf = _mm_unpacklo_epi64(t0, t1); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_srli_si128(m2, 4); - t1 = _mm_blend_epi16(m0, m3, 0x03); - buf = _mm_blend_epi16(t1, t0, 0x3C); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_blend_epi16(m1, m0, 0x0C); - t1 = _mm_srli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0x30); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpacklo_epi64(m2, m1); - t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0)); - t2 = _mm_srli_si128(t0, 4); - buf = _mm_blend_epi16(t1, t2, 0x33); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); - - // round 7 - t0 = _mm_slli_si128(m1, 12); - t1 = _mm_blend_epi16(m0, m3, 0x33); - buf = _mm_blend_epi16(t1, t0, 0xC0); - 
g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_blend_epi16(m3, m2, 0x30); - t1 = _mm_srli_si128(m1, 4); - t2 = _mm_blend_epi16(t0, t1, 0x03); - buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); - t0 = _mm_unpacklo_epi64(m0, m2); - t1 = _mm_srli_si128(m1, 4); - buf = - _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2)); - g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); - t0 = _mm_unpackhi_epi32(m1, m2); - t1 = _mm_unpackhi_epi64(m0, t0); - buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); undiagonalize(&rows[0], &rows[2], &rows[3]); } diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h index 9a44391..576ccf4 100644 --- a/src/c/blake3_impl.h +++ b/src/c/blake3_impl.h @@ -18,7 +18,8 @@ #define PARENT 4 #define ROOT 8 #define KEYED_HASH 16 -#define DERIVE_KEY 32 +#define DERIVE_KEY_CONTEXT 32 +#define DERIVE_KEY_MATERIAL 64 // This C implementation tries to support recent versions of GCC, Clang, and // MSVC. 
@@ -34,12 +35,12 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, static const uint8_t MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, }; // Count the number of 1 bits. @@ -80,53 +81,17 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], key_words[7] = load32(&key[7 * 4]); } -// Declarations for implementation-specific functions. 
-void blake3_compress_in_place_portable(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags); -void blake3_compress_xof_portable(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t counter, - uint8_t flags, uint8_t out[64]); -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); -void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - 
size_t blocks, const uint32_t key[8], - uint64_t counter, bool increment_counter, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out); +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c index 7335c19..46691f5 100644 --- a/src/c/blake3_neon.c +++ b/src/c/blake3_neon.c @@ -290,6 +290,11 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, * ---------------------------------------------------------------------------- */ +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + INLINE void hash_one_neon(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, @@ -97,12 +97,12 @@ const IV: &CVWords = &[ const MSG_SCHEDULE: [[usize; 16]; 7] = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], - [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4], - [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8], - [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13], - [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9], - [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 
15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], ]; // These are the internal flags that we use to domain separate root/non-root, diff --git a/src/sse41.rs b/src/sse41.rs index 94f1272..70925c6 100644 --- a/src/sse41.rs +++ b/src/sse41.rs @@ -73,34 +73,34 @@ unsafe fn rot7(a: __m128i) -> __m128i { #[inline(always)] unsafe fn g1( + row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, - row4: &mut __m128i, m: __m128i, ) { - *row1 = add(add(*row1, m), *row2); - *row4 = xor(*row4, *row1); - *row4 = rot16(*row4); - *row3 = add(*row3, *row4); - *row2 = xor(*row2, *row3); - *row2 = rot12(*row2); + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( + row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, - row4: &mut __m128i, m: __m128i, ) { - *row1 = add(add(*row1, m), *row2); - *row4 = xor(*row4, *row1); - *row4 = rot8(*row4); - *row3 = add(*row3, *row4); - *row2 = xor(*row2, *row3); - *row2 = rot7(*row2); + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); } // Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. @@ -110,21 +110,31 @@ macro_rules! _MM_SHUFFLE { }; } -// Note the optimization here of leaving row2 as the unrotated row, rather than -// row1. All the message loads below are adjusted to compensate for this. See +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. 
All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] -unsafe fn diagonalize(row1: &mut __m128i, row3: &mut __m128i, row4: &mut __m128i) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE!(2, 1, 0, 3)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE!(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(0, 3, 2, 1)); +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); } #[inline(always)] -unsafe fn undiagonalize(row1: &mut __m128i, row3: &mut __m128i, row4: &mut __m128i) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE!(0, 3, 2, 1)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE!(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(2, 1, 0, 3)); +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); } #[inline(always)] @@ -135,180 +145,182 @@ unsafe fn compress_pre( counter: u64, flags: u8, ) -> [__m128i; 4] { - let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row4 = &mut set4( + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); - let m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - 
let m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - // round 1 - let buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), - _mm_castsi128_ps(m1), - _MM_SHUFFLE!(2, 0, 2, 0), - )); - g1(row1, row2, row3, row4, buf); - let buf = _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(m0), - _mm_castsi128_ps(m1), - _MM_SHUFFLE!(3, 1, 3, 1), - )); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE!(3, 2, 0, 1)); - let t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE!(0, 1, 3, 2)); - let buf = _mm_blend_epi16(t0, t1, 0xC3); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_blend_epi16(t0, t1, 0x3C); - let buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(2, 3, 0, 1)); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 2 - let t0 = _mm_blend_epi16(m1, m2, 0x0C); - let t1 = _mm_slli_si128(m3, 4); - let t2 = _mm_blend_epi16(t0, t1, 0xF0); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE!(0, 0, 2, 0)); - let t1 = _mm_blend_epi16(m1, m3, 0xC0); - let t2 = _mm_blend_epi16(t0, t1, 0xF0); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 3, 0, 1)); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_slli_si128(m1, 4); - let t1 = _mm_blend_epi16(m2, t0, 0x30); - let t2 = _mm_blend_epi16(m0, t1, 0xF0); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(3, 0, 1, 2)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_unpackhi_epi32(m0, m1); - let t1 = _mm_slli_si128(m3, 4); - let t2 = _mm_blend_epi16(t0, t1, 0x0C); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(3, 0, 1, 2)); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 3 - let t0 = _mm_unpackhi_epi32(m2, m3); - let t1 = _mm_blend_epi16(m3, m1, 0x0C); - let t2 = _mm_blend_epi16(t0, t1, 0x0F); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(3, 1, 0, 2)); - g1(row1, row2, row3, row4, buf); - let t0 
= _mm_unpacklo_epi32(m2, m0); - let t1 = _mm_blend_epi16(t0, m0, 0xF0); - let t2 = _mm_slli_si128(m3, 8); - let buf = _mm_blend_epi16(t1, t2, 0xC0); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_blend_epi16(m0, m2, 0x3C); - let t1 = _mm_srli_si128(m1, 12); - let t2 = _mm_blend_epi16(t0, t1, 0x03); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_slli_si128(m3, 4); - let t1 = _mm_blend_epi16(m0, m1, 0x33); - let t2 = _mm_blend_epi16(t1, t0, 0xC0); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(1, 2, 3, 0)); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 4 - let t0 = _mm_unpackhi_epi32(m0, m1); - let t1 = _mm_unpackhi_epi32(t0, m2); - let t2 = _mm_blend_epi16(t1, m3, 0x0C); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(3, 1, 0, 2)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_slli_si128(m2, 8); - let t1 = _mm_blend_epi16(m3, m0, 0x0C); - let t2 = _mm_blend_epi16(t1, t0, 0xC0); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 0, 1, 3)); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_blend_epi16(m0, m1, 0x0F); - let t1 = _mm_blend_epi16(t0, m3, 0xC0); - let buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE!(0, 1, 2, 3)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_alignr_epi8(m0, m1, 4); - let buf = _mm_blend_epi16(t0, m2, 0x33); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 5 - let t0 = _mm_unpacklo_epi64(m1, m2); - let t1 = _mm_unpackhi_epi64(m0, m2); - let t2 = _mm_blend_epi16(t0, t1, 0x33); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 0, 1, 3)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_unpackhi_epi64(m1, m3); - let t1 = _mm_unpacklo_epi64(m0, m1); - let buf = _mm_blend_epi16(t0, t1, 0x33); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_unpackhi_epi64(m3, m1); - let t1 = _mm_unpackhi_epi64(m2, m0); - let t2 = 
_mm_blend_epi16(t1, t0, 0x33); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_blend_epi16(m0, m2, 0x03); - let t1 = _mm_slli_si128(t0, 8); - let t2 = _mm_blend_epi16(t1, m3, 0x0F); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 0, 3, 1)); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 6 - let t0 = _mm_unpackhi_epi32(m0, m1); - let t1 = _mm_unpacklo_epi32(m0, m2); - let buf = _mm_unpacklo_epi64(t0, t1); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_srli_si128(m2, 4); - let t1 = _mm_blend_epi16(m0, m3, 0x03); - let buf = _mm_blend_epi16(t1, t0, 0x3C); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_blend_epi16(m1, m0, 0x0C); - let t1 = _mm_srli_si128(m3, 4); - let t2 = _mm_blend_epi16(t0, t1, 0x30); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 3, 0, 1)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_unpacklo_epi64(m2, m1); - let t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE!(2, 0, 1, 0)); - let t2 = _mm_srli_si128(t0, 4); - let buf = _mm_blend_epi16(t1, t2, 0x33); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, row3, row4); - - // round 7 - let t0 = _mm_slli_si128(m1, 12); - let t1 = _mm_blend_epi16(m0, m3, 0x33); - let buf = _mm_blend_epi16(t1, t0, 0xC0); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_blend_epi16(m3, m2, 0x30); - let t1 = _mm_srli_si128(m1, 4); - let t2 = _mm_blend_epi16(t0, t1, 0x03); - let buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 3, 0)); - g2(row1, row2, row3, row4, buf); - diagonalize(row1, row3, row4); - let t0 = _mm_unpacklo_epi64(m0, m2); - let t1 = _mm_srli_si128(m1, 4); - let buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE!(3, 1, 0, 2)); - g1(row1, row2, row3, row4, buf); - let t0 = _mm_unpackhi_epi32(m1, m2); - let t1 = _mm_unpackhi_epi64(m0, t0); - let buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE!(0, 1, 2, 3)); - g2(row1, row2, row3, row4, buf); - undiagonalize(row1, 
row3, row4); - - [*row1, *row2, *row3, *row4] + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, 
row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] } #[target_feature(enable = "sse4.1")] @@ -319,9 +331,9 @@ pub unsafe fn compress_in_place( counter: u64, flags: u8, ) { - let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8); + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } #[target_feature(enable = "sse4.1")] @@ -332,13 +344,13 @@ pub unsafe fn compress_xof( counter: u64, flags: u8, ) -> [u8; 64] { - let [mut row1, mut row2, mut row3, mut row4] = + let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); row1 = xor(row1, row3); - row2 = xor(row2, row4); - row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8)); - row4 = xor(row4, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row1, row2, row3, row4]) + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) } #[inline(always)] |
