| | | |
|:--|:--|:--|
| author | Jack O'Connor <[email protected]> | 2019-12-10 14:20:09 -0500 |
| committer | Jack O'Connor <[email protected]> | 2019-12-11 18:05:26 -0500 |
| commit | 52ea6487f88a0e5cbc2f784f3095539afe6c91e4 | |
| tree | 181508c1840c2961e530e982c4525029d79e5685 | |
| parent | d68882da0d897c93a271a7c0f6d6b9b13d13aa16 | |
switch to representing CVs as words for the compression function
The portable implementation was getting slowed down by converting back
and forth between words and bytes.
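To illustrate the cost being removed, here is a minimal sketch. The conversion helpers mirror the `words_from_le_bytes_32`/`le_bytes_from_words_32` functions this commit adds to src/platform.rs, while `rounds` is a placeholder of my own, not the real seven-round compression (which also operates on words):

```rust
type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
    let mut words = [0u32; 8];
    for i in 0..8 {
        words[i] = u32::from_le_bytes([
            bytes[i * 4],
            bytes[i * 4 + 1],
            bytes[i * 4 + 2],
            bytes[i * 4 + 3],
        ]);
    }
    words
}

fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
    let mut bytes = [0u8; 32];
    for i in 0..8 {
        bytes[i * 4..][..4].copy_from_slice(&words[i].to_le_bytes());
    }
    bytes
}

// Placeholder for the seven BLAKE3 rounds, which operate on words.
fn rounds(words: &mut CVWords) {
    for w in words.iter_mut() {
        *w = w.wrapping_mul(0x9E37_79B9).rotate_left(5);
    }
}

// Old shape: every block pays two endianness conversions.
fn compress_to_bytes(cv: &CVBytes) -> CVBytes {
    let mut words = words_from_le_bytes_32(cv); // decode, once per block
    rounds(&mut words);
    le_bytes_from_words_32(&words) // re-encode, once per block
}

// New shape: the CV stays in word form across the whole chunk, and the
// byte conversion happens only once, at the chunk boundary.
fn compress_in_place(cv: &mut CVWords) {
    rounds(cv);
}
```

In the SIMD paths the conversion was already free on little-endian x86 (a transmute, per the "x86 is little-endian" comments removed in the diff below), which is why the portable implementation benefited most.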
I made the corresponding change on the C side first
(https://github.com/veorq/BLAKE3-c/commit/12a37be8b50922a358c016ba07f46816a3da4a31),
and as part of this commit I'm re-vendoring the C code. I'm also
exposing a small FFI interface from Rust to C, so that blake3_neon.c can
link against portable.rs rather than blake3_portable.c; see c_neon.rs.
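That shim is small enough to quote; this is the exported function from the c_neon.rs hunk in the diff below, reproduced with explanatory comments:

```rust
// blake3_neon.c normally depends on blake3_portable.c for 1x compression.
// Exporting the portable Rust implementation under the same unmangled C
// symbol lets the NEON C code link against portable.rs instead, avoiding
// linking in unnecessary code.
#[no_mangle]
pub extern "C" fn blake3_compress_in_place_portable(
    cv: *mut u32,
    block: *const u8,
    block_len: u8,
    offset: u64,
    flags: u8,
) {
    unsafe {
        crate::portable::compress_in_place(
            &mut *(cv as *mut [u32; 8]),
            &*(block as *const [u8; 64]),
            block_len,
            offset,
            flags,
        )
    }
}
```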
Diffstat (limited to 'src')

| Mode | File | Lines changed |
|:--|:--|--:|
| -rw-r--r-- | src/avx2.rs | 23 |
| -rw-r--r-- | src/c/blake3.h | 12 |
| -rw-r--r-- | src/c/blake3_avx512.c | 213 |
| -rw-r--r-- | src/c/blake3_impl.h | 70 |
| -rw-r--r-- | src/c/blake3_neon.c | 25 |
| -rw-r--r-- | src/c/blake3_portable.c | 154 |
| -rw-r--r-- | src/c_avx512.rs | 65 |
| -rw-r--r-- | src/c_neon.rs | 41 |
| -rw-r--r-- | src/lib.rs | 103 |
| -rw-r--r-- | src/platform.rs | 119 |
| -rw-r--r-- | src/portable.rs | 133 |
| -rw-r--r-- | src/sse41.rs | 85 |
| -rw-r--r-- | src/test.rs | 48 |

13 files changed, 549 insertions, 542 deletions
diff --git a/src/avx2.rs b/src/avx2.rs index 14673c6..471a2dc 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -3,7 +3,7 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN}; +use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN}; use arrayref::{array_mut_ref, mut_array_refs}; pub const DEGREE: usize = 8; @@ -299,7 +299,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, _ pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -307,16 +307,15 @@ pub unsafe fn hash8( flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { - let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian let mut h_vecs = [ - set1(key_words[0]), - set1(key_words[1]), - set1(key_words[2]), - set1(key_words[3]), - set1(key_words[4]), - set1(key_words[5]), - set1(key_words[6]), - set1(key_words[7]), + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), ]; let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas); let mut block_flags = flags | flags_start; @@ -384,7 +383,7 @@ pub unsafe fn hash8( #[target_feature(enable = "avx2")] pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( mut inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, mut offset: u64, offset_deltas: &OffsetDeltas, flags: u8, diff --git a/src/c/blake3.h b/src/c/blake3.h index 8af3e04..f6e75d8 100644 --- a/src/c/blake3.h +++ b/src/c/blake3.h @@ -10,8 +10,7 @@ #define BLAKE3_MAX_SIMD_DEGREE 16 typedef struct { - uint8_t cv[32]; - uint8_t key[BLAKE3_KEY_LEN]; + uint32_t cv[8]; uint64_t offset; uint16_t count; uint8_t buf[BLAKE3_BLOCK_LEN]; @@ -20,9 +19,10 @@ typedef struct { } blake3_chunk_state; typedef struct { + uint32_t key[8]; blake3_chunk_state chunk; - uint8_t subtree_hashes_len; - uint8_t subtree_hashes[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN]; + uint8_t cv_stack_len; + uint8_t cv_stack[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN]; } blake3_hasher; void blake3_hasher_init(blake3_hasher *self); @@ -32,5 +32,5 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, - uint8_t out[BLAKE3_OUT_LEN]); +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c index 7b0dce1..f30e302 100644 --- a/src/c/blake3_avx512.c +++ b/src/c/blake3_avx512.c @@ -74,50 +74,49 @@ INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } * ---------------------------------------------------------------------------- */ -INLINE void g1(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4, +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { - *row1 = add_128(add_128(*row1, m), *row2); - *row4 = xor_128(*row4, *row1); - *row4 = rot16_128(*row4); - *row3 = add_128(*row3, *row4); - *row2 = xor_128(*row2, *row3); - *row2 = rot12_128(*row2); + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); } -INLINE void 
g2(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4, +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { - *row1 = add_128(add_128(*row1, m), *row2); - *row4 = xor_128(*row4, *row1); - *row4 = rot8_128(*row4); - *row3 = add_128(*row3, *row4); - *row2 = xor_128(*row2, *row3); - *row2 = rot7_128(*row2); + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); } -// Note the optimization here of leaving row2 as the unrotated row, rather than -// row1. All the message loads below are adjusted to compensate for this. See +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row1, __m128i *row3, __m128i *row4) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(2, 1, 0, 3)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(0, 3, 2, 1)); +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } -INLINE void undiagonalize(__m128i *row1, __m128i *row3, __m128i *row4) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(0, 3, 2, 1)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(2, 1, 0, 3)); +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } -void blake3_compress_avx512(const uint8_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]) { - __m128i row1 = loadu_128(&cv[0]); - __m128i row2 = loadu_128(&cv[16]); - __m128i row3 = set4(IV[0], IV[1], IV[2], IV[3]); - __m128i row4 = set4(offset_low(offset), offset_high(offset), - (uint32_t)block_len, (uint32_t)flags); +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len, + (uint32_t)flags); __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); @@ -129,160 +128,177 @@ void blake3_compress_avx512(const uint8_t cv[8], // round 1 buf = _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); buf = _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1))); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1)); t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2)); buf = _mm_blend_epi16(t0, t1, 
0xC3); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(t0, t1, 0x3C); buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 2 t0 = _mm_blend_epi16(m1, m2, 0x0C); t1 = _mm_slli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); t1 = _mm_blend_epi16(m1, m3, 0xC0); t2 = _mm_blend_epi16(t0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_slli_si128(m1, 4); t1 = _mm_blend_epi16(m2, t0, 0x30); t2 = _mm_blend_epi16(m0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_slli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0x0C); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 3 t0 = _mm_unpackhi_epi32(m2, m3); t1 = _mm_blend_epi16(m3, m1, 0x0C); t2 = _mm_blend_epi16(t0, t1, 0x0F); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpacklo_epi32(m2, m0); t1 = _mm_blend_epi16(t0, m0, 0xF0); t2 = _mm_slli_si128(m3, 8); buf = _mm_blend_epi16(t1, t2, 0xC0); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m0, m2, 0x3C); t1 = _mm_srli_si128(m1, 12); t2 = _mm_blend_epi16(t0, t1, 0x03); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_slli_si128(m3, 4); t1 = _mm_blend_epi16(m0, m1, 0x33); t2 = _mm_blend_epi16(t1, t0, 0xC0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 4 t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_unpackhi_epi32(t0, m2); t2 = _mm_blend_epi16(t1, m3, 0x0C); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_slli_si128(m2, 8); t1 = _mm_blend_epi16(m3, m0, 0x0C); t2 = _mm_blend_epi16(t1, t0, 0xC0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m0, m1, 0x0F); t1 = _mm_blend_epi16(t0, m3, 0xC0); buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_alignr_epi8(m0, m1, 4); buf = _mm_blend_epi16(t0, m2, 0x33); - 
g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 5 t0 = _mm_unpacklo_epi64(m1, m2); t1 = _mm_unpackhi_epi64(m0, m2); t2 = _mm_blend_epi16(t0, t1, 0x33); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi64(m1, m3); t1 = _mm_unpacklo_epi64(m0, m1); buf = _mm_blend_epi16(t0, t1, 0x33); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_unpackhi_epi64(m3, m1); t1 = _mm_unpackhi_epi64(m2, m0); t2 = _mm_blend_epi16(t1, t0, 0x33); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(m0, m2, 0x03); t1 = _mm_slli_si128(t0, 8); t2 = _mm_blend_epi16(t1, m3, 0x0F); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 6 t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_unpacklo_epi32(m0, m2); buf = _mm_unpacklo_epi64(t0, t1); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_srli_si128(m2, 4); t1 = _mm_blend_epi16(m0, m3, 0x03); buf = _mm_blend_epi16(t1, t0, 0x3C); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m1, m0, 0x0C); t1 = _mm_srli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0x30); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpacklo_epi64(m2, m1); t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0)); t2 = _mm_srli_si128(t0, 4); buf = _mm_blend_epi16(t1, t2, 0x33); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 7 t0 = _mm_slli_si128(m1, 12); t1 = _mm_blend_epi16(m0, m3, 0x33); buf = _mm_blend_epi16(t1, t0, 0xC0); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(m3, m2, 0x30); t1 = _mm_srli_si128(m1, 4); t2 = _mm_blend_epi16(t0, t1, 0x03); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_unpacklo_epi64(m0, m2); t1 = _mm_srli_si128(m1, 4); buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi32(m1, m2); t1 = _mm_unpackhi_epi64(m0, t0); buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]) { + 
__m128i rows[4]; + compress_pre(rows, cv, block, block_len, offset, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} - storeu_128(xor_128(row1, row3), &out[0]); - storeu_128(xor_128(row2, row4), &out[16]); - storeu_128(xor_128(row3, loadu_128(&cv[0])), &out[32]); - storeu_128(xor_128(row4, loadu_128(&cv[16])), &out[48]); +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, offset, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); } /* @@ -461,15 +477,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], } void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m128i h_vecs[8] = { - set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]), - set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]), - set1_128(key_words[6]), set1_128(key_words[7]), + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; __m128i offset_low_vec, offset_high_vec; load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -710,15 +723,12 @@ INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8], } void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m256i h_vecs[8] = { - set1_256(key_words[0]), set1_256(key_words[1]), set1_256(key_words[2]), - set1_256(key_words[3]), set1_256(key_words[4]), set1_256(key_words[5]), - set1_256(key_words[6]), set1_256(key_words[7]), + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), }; __m256i offset_low_vec, offset_high_vec; load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -1023,16 +1033,13 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16], } void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m512i h_vecs[8] = { - set1_512(key_words[0]), set1_512(key_words[1]), set1_512(key_words[2]), - set1_512(key_words[3]), set1_512(key_words[4]), set1_512(key_words[5]), - set1_512(key_words[6]), set1_512(key_words[7]), + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), 
set1_512(key[6]), set1_512(key[7]), }; __m512i offset_low_vec, offset_high_vec; load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -1107,20 +1114,18 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; + uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } - uint8_t out[64]; - blake3_compress_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -1129,7 +1134,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, } void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h index d2e0325..af55d93 100644 --- a/src/c/blake3_impl.h +++ b/src/c/blake3_impl.h @@ -32,12 +32,6 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; -static const uint8_t IV_BYTES[32] = { - 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e, - 0x3c, 0x3a, 0xf5, 0x4f, 0xa5, 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68, - 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b, -}; - static const uint8_t MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -81,41 +75,71 @@ INLINE uint32_t offset_high(uint64_t offset) { return (uint32_t)(offset >> 32); } +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + // Declarations for implementation-specific functions. 
-void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); -void blake3_compress_sse41(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); -void blake3_compress_avx512(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c index 86a99bf..0ffa17e 100644 --- a/src/c/blake3_neon.c +++ b/src/c/blake3_neon.c @@ -223,17 +223,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], } void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - // TODO: This is assuming little-endian. Is there such a thing as NEON on - // big-endian? 
- memcpy(key_words, key, BLAKE3_KEY_LEN); uint32x4_t h_vecs[8] = { - set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]), - set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]), - set1_128(key_words[6]), set1_128(key_words[7]), + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; uint32x4_t offset_low_vec, offset_high_vec; load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -294,10 +289,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, - uint8_t flags, uint8_t flags_start, uint8_t flags_end, + const uint32_t key[8], uint64_t offset, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; + uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { @@ -307,10 +302,8 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, // TODO: Implement compress_neon. However note that according to // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, // compress_neon might not be any faster than compress_portable. - uint8_t out[64]; - blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -319,7 +312,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, } void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { diff --git a/src/c/blake3_portable.c b/src/c/blake3_portable.c deleted file mode 100644 index 6b58cf4..0000000 --- a/src/c/blake3_portable.c +++ /dev/null @@ -1,154 +0,0 @@ -#include "blake3_impl.h" -#include <string.h> - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t *state, const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - uint32_t state[16] = { - load32(&cv[0 * 4]), - load32(&cv[1 * 4]), - load32(&cv[2 * 4]), - load32(&cv[3 * 4]), - load32(&cv[4 * 4]), - load32(&cv[5 * 4]), - load32(&cv[6 * 4]), - load32(&cv[7 * 4]), - IV[0], - IV[1], - IV[2], - IV[3], - offset_low(offset), - offset_high(offset), - (uint32_t)block_len, - (uint32_t)flags, - }; - - round_fn(&state[0], &block_words[0], 0); - round_fn(&state[0], &block_words[0], 1); - round_fn(&state[0], &block_words[0], 2); - round_fn(&state[0], &block_words[0], 3); - round_fn(&state[0], &block_words[0], 4); - round_fn(&state[0], &block_words[0], 5); - round_fn(&state[0], &block_words[0], 6); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void hash_one_portable(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], - uint64_t offset, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; - memcpy(cv, key, 32); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - uint8_t out[64]; - blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, 32); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], - uint64_t offset, offset_deltas_t 
offset_deltas, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, offset, flags, flags_start, - flags_end, out); - inputs += 1; - num_inputs -= 1; - offset += offset_deltas[1]; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/c_avx512.rs b/src/c_avx512.rs index 4ea2542..0120a2c 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,17 +1,29 @@ -use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; +use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; +use arrayref::array_ref; pub const DEGREE: usize = 16; // Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress( - cv: &[u8; 32], +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags) +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress_xof( + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; - ffi::blake3_compress_avx512( + ffi::blake3_compress_xof_avx512( cv.as_ptr(), block.as_ptr(), block_len, @@ -25,7 +37,7 @@ pub unsafe fn compress( // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -53,53 +65,26 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( pub mod ffi { extern "C" { - pub fn blake3_compress_avx512( - cv: *const u8, + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, block: *const u8, block_len: u8, offset: u64, flags: u8, - out: *mut u8, - ); - // hash4/hash8/hash16 are exposed here for benchmarks. - pub fn blake3_hash4_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - pub fn blake3_hash8_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, ); - pub fn blake3_hash16_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, offset: u64, - offset_deltas: *const u64, flags: u8, - flags_start: u8, - flags_end: u8, out: *mut u8, ); pub fn blake3_hash_many_avx512( inputs: *const *const u8, num_inputs: usize, blocks: usize, - key: *const u8, + key: *const u32, offset: u64, offset_deltas: *const u64, flags: u8, @@ -119,7 +104,7 @@ mod test { if !crate::platform::avx512_detected() { return; } - crate::test::test_compress_fn(compress); + crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] diff --git a/src/c_neon.rs b/src/c_neon.rs index 15e76d2..029bf7e 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,11 +1,11 @@ -use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; +use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; pub const DEGREE: usize = 4; // Unsafe because this may only be called on platforms supporting NEON. 
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -31,25 +31,36 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( ) } +// blake3_neon.c normally depends on blake3_portable.c, because the NEON +// implementation only provides 4x compression, and it relies on the portable +// implementation for 1x compression. However, we expose the portable Rust +// implementation here instead, to avoid linking in unnecessary code. +#[no_mangle] +pub extern "C" fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + offset: u64, + flags: u8, +) { + unsafe { + crate::portable::compress_in_place( + &mut *(cv as *mut [u32; 8]), + &*(block as *const [u8; 64]), + block_len, + offset, + flags, + ) + } +} + pub mod ffi { extern "C" { - // Exposed here for benchmarks. - pub fn blake3_hash4_neon( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); pub fn blake3_hash_many_neon( inputs: *const *const u8, num_inputs: usize, blocks: usize, - key: *const u8, + key: *const u32, offset: u64, offset_deltas: *const u64, flags: u8, @@ -40,15 +40,18 @@ pub const BLOCK_LEN: usize = 64; #[doc(hidden)] pub const CHUNK_LEN: usize = 2048; -const IV: &[u32; 8] = &[ +// While iterating the compression function within a chunk, the CV is +// represented as words, to avoid doing two extra endianness conversions for +// each compression in the portable implementation. But the hash_many interface +// needs to hash both input bytes and parent nodes, so its better for its +// output CVs to be represented as bytes. +type CVWords = [u32; 8]; +type CVBytes = [u8; 32]; // little-endian + +const IV: &CVWords = &[ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, ]; -const IV_BYTES: &[u8; 32] = &[ - 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e, 0x3c, 0x3a, 0xf5, 0x4f, 0xa5, - 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68, 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b, -]; - const MSG_SCHEDULE: [[usize; 16]; 7] = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], @@ -173,7 +176,7 @@ impl fmt::Debug for Hash { // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. 
struct Output { - input_chaining_value: [u8; 32], + input_chaining_value: CVWords, block: [u8; 64], block_len: u8, offset: u64, @@ -182,34 +185,31 @@ struct Output { } impl Output { - fn chaining_value(&self) -> [u8; 32] { - let out = self.platform.compress( - &self.input_chaining_value, + fn chaining_value(&self) -> CVBytes { + let mut cv = self.input_chaining_value; + self.platform.compress_in_place( + &mut cv, &self.block, self.block_len, self.offset, self.flags, ); - *array_ref!(out, 0, 32) + platform::le_bytes_from_words_32(&cv) } fn root_hash(&self) -> Hash { debug_assert_eq!(self.offset, 0); - let out = self.platform.compress( - &self.input_chaining_value, - &self.block, - self.block_len, - 0, - self.flags | ROOT, - ); - Hash(*array_ref!(out, 0, 32)) + let mut cv = self.input_chaining_value; + self.platform + .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); + Hash(platform::le_bytes_from_words_32(&cv)) } fn root_output_bytes(&self, out_slice: &mut [u8]) { debug_assert_eq!(self.offset, 0); let mut offset = 0; for out_block in out_slice.chunks_mut(2 * OUT_LEN) { - let out_bytes = self.platform.compress( + let out_bytes = self.platform.compress_xof( &self.input_chaining_value, &self.block, self.block_len, @@ -224,7 +224,7 @@ impl Output { #[derive(Clone)] struct ChunkState { - cv: [u8; 32], + cv: CVWords, offset: u64, buf: [u8; BLOCK_LEN], buf_len: u8, @@ -234,7 +234,7 @@ struct ChunkState { } impl ChunkState { - fn new(key: &[u8; 32], offset: u64, flags: u8, platform: Platform) -> Self { + fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self { Self { cv: *key, offset, @@ -246,7 +246,7 @@ impl ChunkState { } } - fn reset(&mut self, key: &[u8; KEY_LEN], new_offset: u64) { + fn reset(&mut self, key: &CVWords, new_offset: u64) { debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0); self.cv = *key; self.offset = new_offset; @@ -283,14 +283,13 @@ impl ChunkState { if !input.is_empty() { debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); let block_flags = self.flags | self.start_flag(); // borrowck - let output = self.platform.compress( - &self.cv, + self.platform.compress_in_place( + &mut self.cv, &self.buf, BLOCK_LEN as u8, self.offset, block_flags, ); - self.cv = *array_ref!(output, 0, 32); self.buf_len = 0; self.buf = [0; BLOCK_LEN]; self.blocks_compressed += 1; @@ -300,14 +299,13 @@ impl ChunkState { while input.len() > BLOCK_LEN { debug_assert_eq!(self.buf_len, 0); let block_flags = self.flags | self.start_flag(); // borrowck - let output = self.platform.compress( - &self.cv, + self.platform.compress_in_place( + &mut self.cv, array_ref!(input, 0, BLOCK_LEN), BLOCK_LEN as u8, self.offset, block_flags, ); - self.cv = *array_ref!(output, 0, 32); self.blocks_compressed += 1; input = &input[BLOCK_LEN..]; } @@ -400,7 +398,7 @@ where // those cases use a different codepath. fn compress_chunks_parallel( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -448,7 +446,7 @@ fn compress_chunks_parallel( // never empty; those cases use a different codepath. fn compress_parents_parallel( child_chaining_values: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, flags: u8, platform: Platform, out: &mut [u8], @@ -501,7 +499,7 @@ fn compress_parents_parallel( // codepath. fn compress_subtree_wide( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -573,7 +571,7 @@ fn compress_subtree_wide( // chunk or less. That's a different codepath. 
fn compress_subtree_to_parent_node( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -597,7 +595,7 @@ fn compress_subtree_to_parent_node( // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. -fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output { +fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { let platform = Platform::detect(); // If the whole subtree is one chunk, hash it directly with a ChunkState. @@ -621,23 +619,25 @@ fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output { /// The default hash function. pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once(input, IV_BYTES, 0).root_hash() + hash_all_at_once(input, IV, 0).root_hash() } /// The keyed hash function. pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - hash_all_at_once(input, key, KEYED_HASH).root_hash() + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once(input, &key_words, KEYED_HASH).root_hash() } /// The key derivation function. pub fn derive_key(key: &[u8; KEY_LEN], context: &[u8]) -> Hash { - hash_all_at_once(context, key, DERIVE_KEY).root_hash() + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once(context, &key_words, DERIVE_KEY).root_hash() } fn parent_node_output( - left_child: &[u8; 32], - right_child: &[u8; 32], - key: &[u8; KEY_LEN], + left_child: &CVBytes, + right_child: &CVBytes, + key: &CVWords, flags: u8, platform: Platform, ) -> Output { @@ -658,14 +658,14 @@ fn parent_node_output( /// support for extendable output. #[derive(Clone)] pub struct Hasher { - key: [u8; KEY_LEN], + key: CVWords, chunk_state: ChunkState, // 2^53 * 2048 = 2^64 - cv_stack: ArrayVec<[[u8; OUT_LEN]; 53]>, + cv_stack: ArrayVec<[CVBytes; 53]>, } impl Hasher { - fn new_internal(key: &[u8; 32], flags: u8) -> Self { + fn new_internal(key: &CVWords, flags: u8) -> Self { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), @@ -675,12 +675,13 @@ impl Hasher { /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { - Self::new_internal(IV_BYTES, 0) + Self::new_internal(IV, 0) } /// Construct a new `Hasher` for the keyed hash function. pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - Self::new_internal(key, KEYED_HASH) + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. @@ -691,7 +692,8 @@ impl Hasher { /// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(key: &[u8; KEY_LEN]) -> Self { - Self::new_internal(key, DERIVE_KEY) + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, DERIVE_KEY) } /// The total number of input bytes so far. 
@@ -705,19 +707,18 @@ impl Hasher { while self.cv_stack.len() > post_merge_stack_len { let right_child = self.cv_stack.pop().unwrap(); let left_child = self.cv_stack.pop().unwrap(); - let parent_cv = parent_node_output( + let parent_output = parent_node_output( &left_child, &right_child, &self.key, self.chunk_state.flags, self.chunk_state.platform, - ) - .chaining_value(); - self.cv_stack.push(parent_cv); + ); + self.cv_stack.push(parent_output.chaining_value()); } } - fn push_cv(&mut self, new_cv: &[u8; 32], offset: u64) { + fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) { // In reference_impl.rs, we merge the new CV with existing CVs from the // stack before pushing it. We can do that because we know more input // is coming, so we know none of the merges are root. diff --git a/src/platform.rs b/src/platform.rs index 4621fc5..b42f7b9 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,4 +1,5 @@ -use crate::{portable, OffsetDeltas, BLOCK_LEN, KEY_LEN}; +use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN}; +use arrayref::{array_mut_ref, array_ref}; #[cfg(feature = "c_avx512")] use crate::c_avx512; @@ -91,27 +92,55 @@ impl Platform { degree } - pub(crate) fn compress( + pub(crate) fn compress_in_place( &self, - cv: &[u8; 32], + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + sse41::compress_in_place(cv, block, block_len, offset, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(feature = "c_avx512")] + Platform::AVX512 => unsafe { + c_avx512::compress_in_place(cv, block, block_len, offset, flags) + }, + // No NEON compress_in_place() implementation yet. + #[cfg(feature = "c_neon")] + Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags), + } + } + + pub(crate) fn compress_xof( + &self, + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, ) -> [u8; 64] { match self { - Platform::Portable => portable::compress(cv, block, block_len, offset, flags), + Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress(cv, block, block_len, offset, flags) + sse41::compress_xof(cv, block, block_len, offset, flags) }, // Safe because detect() checked for platform support. #[cfg(feature = "c_avx512")] - Platform::AVX512 => unsafe { c_avx512::compress(cv, block, block_len, offset, flags) }, - // No NEON compress() implementation yet. + Platform::AVX512 => unsafe { + c_avx512::compress_xof(cv, block, block_len, offset, flags) + }, + // No NEON compress_xof() implementation yet. 
#[cfg(feature = "c_neon")] - Platform::NEON => portable::compress(cv, block, block_len, offset, flags), + Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags), } } @@ -128,7 +157,7 @@ impl Platform { pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>( &self, inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -262,3 +291,75 @@ pub fn sse41_detected() -> bool { } false } + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); + out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); + out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); + out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); + out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); + out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); + out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); + out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); + *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); + *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); + *array_mut_ref!(out, 11 * 
4, 4) = words[11].to_le_bytes(); + *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); + *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); + *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); + *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); + out +} diff --git a/src/portable.rs b/src/portable.rs index b07c46a..fa0e17d 100644 --- a/src/portable.rs +++ b/src/portable.rs @@ -1,4 +1,6 @@ -use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN}; +use crate::{ + offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; use arrayref::{array_mut_ref, array_ref}; #[inline(always)] @@ -31,41 +33,25 @@ fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } -pub fn compress( - cv: &[u8; 32], +#[inline(always)] +fn compress_pre( + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, -) -> [u8; 64] { - let block_words = [ - u32::from_le_bytes(*array_ref!(block, 0 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 1 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 2 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 3 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 4 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 5 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 6 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 7 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 8 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 9 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 10 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 11 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 12 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 13 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 14 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 15 * 4, 4)), - ]; +) -> [u32; 16] { + let block_words = crate::platform::words_from_le_bytes_64(block); let mut state = [ - u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)), + cv[0], + cv[1], + cv[2], + cv[3], + cv[4], + cv[5], + cv[6], + cv[7], IV[0], IV[1], IV[2], @@ -84,6 +70,36 @@ pub fn compress( round(&mut state, &block_words, 5); round(&mut state, &block_words, 6); + state +} + +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) { + let state = compress_pre(cv, block, block_len, offset, flags); + + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) -> [u8; 64] { + let mut state = compress_pre(cv, block, block_len, offset, flags); state[0] ^= state[8]; state[1] ^= state[9]; state[2] ^= state[10]; @@ -92,43 +108,25 @@ pub fn compress( state[5] ^= state[13]; state[6] ^= state[14]; state[7] ^= state[15]; - state[8] ^= u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)); - state[9] ^= u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)); - state[10] ^= 
u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)); - state[11] ^= u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)); - state[12] ^= u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)); - state[13] ^= u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)); - state[14] ^= u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)); - state[15] ^= u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)); - - let mut out = [0; 64]; - out[0 * 4..][..4].copy_from_slice(&state[0].to_le_bytes()); - out[1 * 4..][..4].copy_from_slice(&state[1].to_le_bytes()); - out[2 * 4..][..4].copy_from_slice(&state[2].to_le_bytes()); - out[3 * 4..][..4].copy_from_slice(&state[3].to_le_bytes()); - out[4 * 4..][..4].copy_from_slice(&state[4].to_le_bytes()); - out[5 * 4..][..4].copy_from_slice(&state[5].to_le_bytes()); - out[6 * 4..][..4].copy_from_slice(&state[6].to_le_bytes()); - out[7 * 4..][..4].copy_from_slice(&state[7].to_le_bytes()); - out[8 * 4..][..4].copy_from_slice(&state[8].to_le_bytes()); - out[9 * 4..][..4].copy_from_slice(&state[9].to_le_bytes()); - out[10 * 4..][..4].copy_from_slice(&state[10].to_le_bytes()); - out[11 * 4..][..4].copy_from_slice(&state[11].to_le_bytes()); - out[12 * 4..][..4].copy_from_slice(&state[12].to_le_bytes()); - out[13 * 4..][..4].copy_from_slice(&state[13].to_le_bytes()); - out[14 * 4..][..4].copy_from_slice(&state[14].to_le_bytes()); - out[15 * 4..][..4].copy_from_slice(&state[15].to_le_bytes()); - out + state[8] ^= cv[0]; + state[9] ^= cv[1]; + state[10] ^= cv[2]; + state[11] ^= cv[3]; + state[12] ^= cv[4]; + state[13] ^= cv[5]; + state[14] ^= cv[6]; + state[15] ^= cv[7]; + crate::platform::le_bytes_from_words_64(&state) } pub fn hash1<A: arrayvec::Array<Item = u8>>( input: &A, - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, flags_start: u8, flags_end: u8, - out: &mut [u8; OUT_LEN], + out: &mut CVBytes, ) { debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; @@ -138,23 +136,22 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>( if slice.len() == BLOCK_LEN { block_flags |= flags_end; } - let output = compress( - &cv, + compress_in_place( + &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, offset, block_flags, ); - cv = *array_ref!(output, 0, 32); block_flags = flags; slice = &slice[BLOCK_LEN..]; } - *out = cv; + *out = crate::platform::le_bytes_from_words_32(&cv); } pub fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, mut offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -182,12 +179,12 @@ pub mod test { use super::*; // This is basically testing the portable implementation against itself, - // but we do it anyway for completeness. Other implementations will test - // themselves against portable. We also have several tests against the - // reference implementation in test.rs. + // but it also checks that compress_in_place and compress_xof are + // consistent. And there are tests against the reference implementation and + // against hardcoded test vectors elsewhere. #[test] fn test_compress() { - crate::test::test_compress_fn(compress); + crate::test::test_compress_fn(compress_in_place, compress_xof); } // Ditto. 
diff --git a/src/sse41.rs b/src/sse41.rs
index a95d8a2..e45b22e 100644
--- a/src/sse41.rs
+++ b/src/sse41.rs
@@ -3,7 +3,9 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+    offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
 use arrayref::{array_mut_ref, array_ref, mut_array_refs};
 
 pub const DEGREE: usize = 4;
@@ -122,16 +124,16 @@ unsafe fn undiagonalize(row1: &mut __m128i, row3: &mut __m128i, row4: &mut __m12
     *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(2, 1, 0, 3));
 }
 
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn compress(
-    cv: &[u8; 32],
+#[inline(always)]
+unsafe fn compress_pre(
+    cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
     offset: u64,
     flags: u8,
-) -> [u8; 64] {
-    let row1 = &mut loadu(&cv[0]);
-    let row2 = &mut loadu(&cv[16]);
+) -> [__m128i; 4] {
+    let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8);
+    let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8);
     let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
     let row4 = &mut set4(
         offset_low(offset),
@@ -303,12 +305,37 @@ pub unsafe fn compress(
     g2(row1, row2, row3, row4, buf);
     undiagonalize(row1, row3, row4);
 
-    *row1 = xor(*row1, *row3);
-    *row2 = xor(*row2, *row4);
-    *row3 = xor(*row3, loadu(&cv[0]));
-    *row4 = xor(*row4, loadu(&cv[16]));
+    [*row1, *row2, *row3, *row4]
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_in_place(
+    cv: &mut CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    offset: u64,
+    flags: u8,
+) {
+    let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags);
+    storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8);
+    storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8);
+}
 
-    core::mem::transmute([*row1, *row2, *row3, *row4]) // x86 is little-endian
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_xof(
+    cv: &CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    offset: u64,
+    flags: u8,
+) -> [u8; 64] {
+    let [mut row1, mut row2, mut row3, mut row4] =
+        compress_pre(cv, block, block_len, offset, flags);
+    row1 = xor(row1, row3);
+    row2 = xor(row2, row4);
+    row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8));
+    row4 = xor(row4, loadu(cv.as_ptr().add(4) as *const u8));
+    core::mem::transmute([row1, row2, row3, row4])
 }
 
 #[inline(always)]
@@ -500,7 +527,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, _
 pub unsafe fn hash4(
     inputs: &[*const u8; DEGREE],
     blocks: usize,
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -508,16 +535,15 @@ pub unsafe fn hash4(
     flags_end: u8,
     out: &mut [u8; DEGREE * OUT_LEN],
 ) {
-    let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian
     let mut h_vecs = [
-        set1(key_words[0]),
-        set1(key_words[1]),
-        set1(key_words[2]),
-        set1(key_words[3]),
-        set1(key_words[4]),
-        set1(key_words[5]),
-        set1(key_words[6]),
-        set1(key_words[7]),
+        set1(key[0]),
+        set1(key[1]),
+        set1(key[2]),
+        set1(key[3]),
+        set1(key[4]),
+        set1(key[5]),
+        set1(key[6]),
+        set1(key[7]),
     ];
     let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
     let mut block_flags = flags | flags_start;
@@ -589,12 +615,12 @@ pub unsafe fn hash4(
 #[target_feature(enable = "sse4.1")]
 unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
     input: &A,
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
-    out: &mut [u8; OUT_LEN],
+    out: &mut CVBytes,
 ) {
     debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
     let mut cv = *key;
@@ -604,24 +630,23 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
         if slice.len() == BLOCK_LEN {
             block_flags |= flags_end;
         }
-        let out = compress(
-            &cv,
+        compress_in_place(
+            &mut cv,
             array_ref!(slice, 0, BLOCK_LEN),
             BLOCK_LEN as u8,
             offset,
             block_flags,
         );
-        cv = *array_ref!(out, 0, 32);
         block_flags = flags;
         slice = &slice[BLOCK_LEN..];
     }
-    *out = cv;
+    *out = core::mem::transmute(cv); // x86 is little-endian
 }
 
 #[target_feature(enable = "sse4.1")]
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     mut inputs: &[&A],
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     mut offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -705,7 +730,7 @@ mod test {
         if !crate::platform::sse41_detected() {
             return;
         }
-        crate::test::test_compress_fn(compress);
+        crate::test::test_compress_fn(compress_in_place, compress_xof);
     }
 
     #[test]
diff --git a/src/test.rs b/src/test.rs
index 9d710d2..e5ee4e2 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,4 +1,4 @@
-use crate::{OffsetDeltas, BLOCK_LEN, CHUNK_LEN, KEY_LEN, OUT_LEN};
+use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
 use arrayref::array_ref;
 use arrayvec::ArrayVec;
 use core::usize;
@@ -31,7 +31,11 @@ pub const TEST_CASES: &[usize] = &[
 
 pub const TEST_CASES_MAX: usize = 31 * CHUNK_LEN;
 
-pub const TEST_KEY: [u8; crate::KEY_LEN] = *b"whats the Elvish word for friend";
+// There's a test to make sure these two are equal below.
+pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: CVWords = [
+    1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
+];
 
 // Paint the input with a repeating byte pattern. We use a cycle length of 251,
 // because that's the largest prime number less than 256. This makes it
@@ -43,8 +47,11 @@ pub fn paint_test_input(buf: &mut [u8]) {
     }
 }
 
-type CompressFn = unsafe fn(
-    cv: &[u8; 32],
+type CompressInPlaceFn =
+    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8);
+
+type CompressXofFn = unsafe fn(
+    cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
     offset: u64,
@@ -52,8 +59,8 @@ type CompressFn = unsafe fn(
 ) -> [u8; 64];
 
 // A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_fn: CompressFn) {
-    let initial_state = *b"IV for compression tests <('.')>";
+pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
+    let initial_state = TEST_KEY_WORDS;
     let block_len: u8 = 61;
     let mut block = [0; BLOCK_LEN];
     paint_test_input(&mut block[..block_len as usize]);
@@ -62,16 +69,21 @@ pub fn test_compress_fn(compress_fn: CompressFn) {
     let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
 
     let portable_out =
-        crate::portable::compress(&initial_state, &block, block_len, offset as u64, flags);
+        crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags);
 
-    let test_out = unsafe { compress_fn(&initial_state, &block, block_len, offset as u64, flags) };
+    let mut test_state = initial_state;
+    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) };
+    let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
+    let test_xof =
+        unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) };
 
-    assert_eq!(&portable_out[..], &test_out[..]);
+    assert_eq!(&portable_out[..32], &test_state_bytes[..]);
+    assert_eq!(&portable_out[..], &test_xof[..]);
 }
 
 type HashManyFn<A> = unsafe fn(
     inputs: &[&A],
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -100,7 +112,7 @@ pub fn test_hash_many_fn(
     let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
     crate::portable::hash_many(
         &chunks,
-        &TEST_KEY,
+        &TEST_KEY_WORDS,
         offset,
         crate::CHUNK_OFFSET_DELTAS,
         crate::DERIVE_KEY,
@@ -113,7 +125,7 @@ pub fn test_hash_many_fn(
     unsafe {
         hash_many_chunks_fn(
             &chunks[..],
-            &TEST_KEY,
+            &TEST_KEY_WORDS,
             offset,
             crate::CHUNK_OFFSET_DELTAS,
             crate::DERIVE_KEY,
@@ -139,7 +151,7 @@ pub fn test_hash_many_fn(
     let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
     crate::portable::hash_many(
         &parents,
-        &TEST_KEY,
+        &TEST_KEY_WORDS,
         0,
         crate::PARENT_OFFSET_DELTAS,
         crate::DERIVE_KEY | crate::PARENT,
@@ -152,7 +164,7 @@ pub fn test_hash_many_fn(
     unsafe {
         hash_many_parents_fn(
             &parents[..],
-            &TEST_KEY,
+            &TEST_KEY_WORDS,
             0,
             crate::PARENT_OFFSET_DELTAS,
             crate::DERIVE_KEY | crate::PARENT,
@@ -172,6 +184,14 @@ pub fn test_hash_many_fn(
 }
 
 #[test]
+fn test_key_bytes_equal_key_words() {
+    assert_eq!(
+        TEST_KEY_WORDS,
+        crate::platform::words_from_le_bytes_32(&TEST_KEY),
+    );
+}
+
+#[test]
 fn test_reference_impl_size() {
     // Because the Rust compiler optimizes struct layout, it's possible that
     // some future version of the compiler will produce a different size. If
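One detail worth noting in the test changes: `TEST_KEY_WORDS` is simply the little-endian `u32` view of the `TEST_KEY` bytes, which is what the new `test_key_bytes_equal_key_words` test pins down via `words_from_le_bytes_32`. A quick sanity check of the first word (an illustration, not code from this commit):

```rust
// The first four key bytes are b"what" = [0x77, 0x68, 0x61, 0x74].
// Read as a little-endian u32, that's 0x74616877 == 1952540791,
// which is TEST_KEY_WORDS[0].
let first_word = u32::from_le_bytes(*b"what");
assert_eq!(first_word, 1952540791);
```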
