author    Jack O'Connor <[email protected]>  2019-12-10 14:20:09 -0500
committer Jack O'Connor <[email protected]>  2019-12-11 18:05:26 -0500
commit    52ea6487f88a0e5cbc2f784f3095539afe6c91e4 (patch)
tree      181508c1840c2961e530e982c4525029d79e5685 /src
parent    d68882da0d897c93a271a7c0f6d6b9b13d13aa16 (diff)
switch to representing CVs as words for the compression function
The portable implementation was getting slowed down by converting back and forth between words and bytes. I made the corresponding change on the C side first (https://github.com/veorq/BLAKE3-c/commit/12a37be8b50922a358c016ba07f46816a3da4a31), and as part of this commit I'm re-vendoring the C code. I'm also exposing a small FFI interface to C so that blake3_neon.c can link against portable.rs rather than blake3_portable.c, see c_neon.rs.
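For context, a sketch of the cost this removes from the portable path: the old byte-oriented compress() forced a bytes-to-words and a words-to-bytes conversion on every block, while the new API keeps the CV as words across the whole chunk and converts to bytes once at the boundary. The names below are the ones this commit introduces; the bodies are simplified stand-ins, not the diff's code:

type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

fn compress_in_place(_cv: &mut CVWords, _block: &[u8; 64]) { /* 7 rounds */ }

fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
    let mut out = [0u8; 32];
    for (chunk, word) in out.chunks_exact_mut(4).zip(words) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    out
}

fn hash_chunk(key: &CVWords, blocks: &[[u8; 64]]) -> CVBytes {
    let mut cv = *key;
    for block in blocks {
        compress_in_place(&mut cv, block); // no per-block byte conversions
    }
    le_bytes_from_words_32(&cv) // bytes only at the chunk boundary
}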
Diffstat (limited to 'src')
-rw-r--r--  src/avx2.rs              |  23
-rw-r--r--  src/c/blake3.h           |  12
-rw-r--r--  src/c/blake3_avx512.c    | 213
-rw-r--r--  src/c/blake3_impl.h      |  70
-rw-r--r--  src/c/blake3_neon.c      |  25
-rw-r--r--  src/c/blake3_portable.c  | 154
-rw-r--r--  src/c_avx512.rs          |  65
-rw-r--r--  src/c_neon.rs            |  41
-rw-r--r--  src/lib.rs               | 103
-rw-r--r--  src/platform.rs          | 119
-rw-r--r--  src/portable.rs          | 133
-rw-r--r--  src/sse41.rs             |  85
-rw-r--r--  src/test.rs              |  48
13 files changed, 549 insertions, 542 deletions
diff --git a/src/avx2.rs b/src/avx2.rs
index 14673c6..471a2dc 100644
--- a/src/avx2.rs
+++ b/src/avx2.rs
@@ -3,7 +3,7 @@ use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN};
use arrayref::{array_mut_ref, mut_array_refs};
pub const DEGREE: usize = 8;
@@ -299,7 +299,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, _
pub unsafe fn hash8(
inputs: &[*const u8; DEGREE],
blocks: usize,
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -307,16 +307,15 @@ pub unsafe fn hash8(
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
- let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian
let mut h_vecs = [
- set1(key_words[0]),
- set1(key_words[1]),
- set1(key_words[2]),
- set1(key_words[3]),
- set1(key_words[4]),
- set1(key_words[5]),
- set1(key_words[6]),
- set1(key_words[7]),
+ set1(key[0]),
+ set1(key[1]),
+ set1(key[2]),
+ set1(key[3]),
+ set1(key[4]),
+ set1(key[5]),
+ set1(key[6]),
+ set1(key[7]),
];
let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
let mut block_flags = flags | flags_start;
@@ -384,7 +383,7 @@ pub unsafe fn hash8(
#[target_feature(enable = "avx2")]
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
mut inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
mut offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
diff --git a/src/c/blake3.h b/src/c/blake3.h
index 8af3e04..f6e75d8 100644
--- a/src/c/blake3.h
+++ b/src/c/blake3.h
@@ -10,8 +10,7 @@
#define BLAKE3_MAX_SIMD_DEGREE 16
typedef struct {
- uint8_t cv[32];
- uint8_t key[BLAKE3_KEY_LEN];
+ uint32_t cv[8];
uint64_t offset;
uint16_t count;
uint8_t buf[BLAKE3_BLOCK_LEN];
@@ -20,9 +19,10 @@ typedef struct {
} blake3_chunk_state;
typedef struct {
+ uint32_t key[8];
blake3_chunk_state chunk;
- uint8_t subtree_hashes_len;
- uint8_t subtree_hashes[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN];
+ uint8_t cv_stack_len;
+ uint8_t cv_stack[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN];
} blake3_hasher;
void blake3_hasher_init(blake3_hasher *self);
@@ -32,5 +32,5 @@ void blake3_hasher_init_derive_key(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
-void blake3_hasher_finalize(const blake3_hasher *self,
- uint8_t out[BLAKE3_OUT_LEN]);
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
+ size_t out_len);
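The finalize change above is what exposes extendable output at the C API boundary: callers pass any out_len instead of a fixed BLAKE3_OUT_LEN buffer. A hypothetical Rust-side binding mirroring the new prototype (Blake3Hasher is an opaque stand-in here, not a type from this repo):

#[repr(C)]
pub struct Blake3Hasher {
    _opaque: [u8; 0], // opaque: only ever handled behind a pointer
}

extern "C" {
    // Hypothetical declaration matching the new C signature above.
    fn blake3_hasher_finalize(hasher: *const Blake3Hasher, out: *mut u8, out_len: usize);
}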
diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c
index 7b0dce1..f30e302 100644
--- a/src/c/blake3_avx512.c
+++ b/src/c/blake3_avx512.c
@@ -74,50 +74,49 @@ INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
* ----------------------------------------------------------------------------
*/
-INLINE void g1(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4,
+INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
- *row1 = add_128(add_128(*row1, m), *row2);
- *row4 = xor_128(*row4, *row1);
- *row4 = rot16_128(*row4);
- *row3 = add_128(*row3, *row4);
- *row2 = xor_128(*row2, *row3);
- *row2 = rot12_128(*row2);
+ *row0 = add_128(add_128(*row0, m), *row1);
+ *row3 = xor_128(*row3, *row0);
+ *row3 = rot16_128(*row3);
+ *row2 = add_128(*row2, *row3);
+ *row1 = xor_128(*row1, *row2);
+ *row1 = rot12_128(*row1);
}
-INLINE void g2(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4,
+INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
- *row1 = add_128(add_128(*row1, m), *row2);
- *row4 = xor_128(*row4, *row1);
- *row4 = rot8_128(*row4);
- *row3 = add_128(*row3, *row4);
- *row2 = xor_128(*row2, *row3);
- *row2 = rot7_128(*row2);
+ *row0 = add_128(add_128(*row0, m), *row1);
+ *row3 = xor_128(*row3, *row0);
+ *row3 = rot8_128(*row3);
+ *row2 = add_128(*row2, *row3);
+ *row1 = xor_128(*row1, *row2);
+ *row1 = rot7_128(*row1);
}
-// Note the optimization here of leaving row2 as the unrotated row, rather than
-// row1. All the message loads below are adjusted to compensate for this. See
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
-INLINE void diagonalize(__m128i *row1, __m128i *row3, __m128i *row4) {
- *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(2, 1, 0, 3));
- *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(0, 3, 2, 1));
+INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
-INLINE void undiagonalize(__m128i *row1, __m128i *row3, __m128i *row4) {
- *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(0, 3, 2, 1));
- *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2));
- *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(2, 1, 0, 3));
+INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
+ *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
+ *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
+ *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
-void blake3_compress_avx512(const uint8_t cv[8],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags,
- uint8_t out[64]) {
- __m128i row1 = loadu_128(&cv[0]);
- __m128i row2 = loadu_128(&cv[16]);
- __m128i row3 = set4(IV[0], IV[1], IV[2], IV[3]);
- __m128i row4 = set4(offset_low(offset), offset_high(offset),
- (uint32_t)block_len, (uint32_t)flags);
+INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset, uint8_t flags) {
+ rows[0] = loadu_128((uint8_t *)&cv[0]);
+ rows[1] = loadu_128((uint8_t *)&cv[4]);
+ rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
+ rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len,
+ (uint32_t)flags);
__m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
@@ -129,160 +128,177 @@ void blake3_compress_avx512(const uint8_t cv[8],
// round 1
buf = _mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0)));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
buf = _mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1)));
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1));
t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2));
buf = _mm_blend_epi16(t0, t1, 0xC3);
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_blend_epi16(t0, t1, 0x3C);
buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1));
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 2
t0 = _mm_blend_epi16(m1, m2, 0x0C);
t1 = _mm_slli_si128(m3, 4);
t2 = _mm_blend_epi16(t0, t1, 0xF0);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0));
t1 = _mm_blend_epi16(m1, m3, 0xC0);
t2 = _mm_blend_epi16(t0, t1, 0xF0);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_slli_si128(m1, 4);
t1 = _mm_blend_epi16(m2, t0, 0x30);
t2 = _mm_blend_epi16(m0, t1, 0xF0);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_unpackhi_epi32(m0, m1);
t1 = _mm_slli_si128(m3, 4);
t2 = _mm_blend_epi16(t0, t1, 0x0C);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2));
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 3
t0 = _mm_unpackhi_epi32(m2, m3);
t1 = _mm_blend_epi16(m3, m1, 0x0C);
t2 = _mm_blend_epi16(t0, t1, 0x0F);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_unpacklo_epi32(m2, m0);
t1 = _mm_blend_epi16(t0, m0, 0xF0);
t2 = _mm_slli_si128(m3, 8);
buf = _mm_blend_epi16(t1, t2, 0xC0);
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_blend_epi16(m0, m2, 0x3C);
t1 = _mm_srli_si128(m1, 12);
t2 = _mm_blend_epi16(t0, t1, 0x03);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_slli_si128(m3, 4);
t1 = _mm_blend_epi16(m0, m1, 0x33);
t2 = _mm_blend_epi16(t1, t0, 0xC0);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0));
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 4
t0 = _mm_unpackhi_epi32(m0, m1);
t1 = _mm_unpackhi_epi32(t0, m2);
t2 = _mm_blend_epi16(t1, m3, 0x0C);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_slli_si128(m2, 8);
t1 = _mm_blend_epi16(m3, m0, 0x0C);
t2 = _mm_blend_epi16(t1, t0, 0xC0);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_blend_epi16(m0, m1, 0x0F);
t1 = _mm_blend_epi16(t0, m3, 0xC0);
buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_alignr_epi8(m0, m1, 4);
buf = _mm_blend_epi16(t0, m2, 0x33);
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 5
t0 = _mm_unpacklo_epi64(m1, m2);
t1 = _mm_unpackhi_epi64(m0, m2);
t2 = _mm_blend_epi16(t0, t1, 0x33);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_unpackhi_epi64(m1, m3);
t1 = _mm_unpacklo_epi64(m0, m1);
buf = _mm_blend_epi16(t0, t1, 0x33);
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_unpackhi_epi64(m3, m1);
t1 = _mm_unpackhi_epi64(m2, m0);
t2 = _mm_blend_epi16(t1, t0, 0x33);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_blend_epi16(m0, m2, 0x03);
t1 = _mm_slli_si128(t0, 8);
t2 = _mm_blend_epi16(t1, m3, 0x0F);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1));
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 6
t0 = _mm_unpackhi_epi32(m0, m1);
t1 = _mm_unpacklo_epi32(m0, m2);
buf = _mm_unpacklo_epi64(t0, t1);
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_srli_si128(m2, 4);
t1 = _mm_blend_epi16(m0, m3, 0x03);
buf = _mm_blend_epi16(t1, t0, 0x3C);
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_blend_epi16(m1, m0, 0x0C);
t1 = _mm_srli_si128(m3, 4);
t2 = _mm_blend_epi16(t0, t1, 0x30);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_unpacklo_epi64(m2, m1);
t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0));
t2 = _mm_srli_si128(t0, 4);
buf = _mm_blend_epi16(t1, t2, 0x33);
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
// round 7
t0 = _mm_slli_si128(m1, 12);
t1 = _mm_blend_epi16(m0, m3, 0x33);
buf = _mm_blend_epi16(t1, t0, 0xC0);
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_blend_epi16(m3, m2, 0x30);
t1 = _mm_srli_si128(m1, 4);
t2 = _mm_blend_epi16(t0, t1, 0x03);
buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0));
- g2(&row1, &row2, &row3, &row4, buf);
- diagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ diagonalize(&rows[0], &rows[2], &rows[3]);
t0 = _mm_unpacklo_epi64(m0, m2);
t1 = _mm_srli_si128(m1, 4);
buf =
_mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2));
- g1(&row1, &row2, &row3, &row4, buf);
+ g1(&rows[0], &rows[1], &rows[2], &rows[3], buf);
t0 = _mm_unpackhi_epi32(m1, m2);
t1 = _mm_unpackhi_epi64(m0, t0);
buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3));
- g2(&row1, &row2, &row3, &row4, buf);
- undiagonalize(&row1, &row3, &row4);
+ g2(&rows[0], &rows[1], &rows[2], &rows[3], buf);
+ undiagonalize(&rows[0], &rows[2], &rows[3]);
+}
+
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags, uint8_t out[64]) {
+ __m128i rows[4];
+ compress_pre(rows, cv, block, block_len, offset, flags);
+ storeu_128(xor_128(rows[0], rows[2]), &out[0]);
+ storeu_128(xor_128(rows[1], rows[3]), &out[16]);
+ storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
+ storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
+}
- storeu_128(xor_128(row1, row3), &out[0]);
- storeu_128(xor_128(row2, row4), &out[16]);
- storeu_128(xor_128(row3, loadu_128(&cv[0])), &out[32]);
- storeu_128(xor_128(row4, loadu_128(&cv[16])), &out[48]);
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags) {
+ __m128i rows[4];
+ compress_pre(rows, cv, block, block_len, offset, flags);
+ storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
+ storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
/*
@@ -461,15 +477,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
}
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
+ const uint32_t key[8], uint64_t offset,
offset_deltas_t offset_deltas, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- uint32_t key_words[8];
- memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian
__m128i h_vecs[8] = {
- set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]),
- set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]),
- set1_128(key_words[6]), set1_128(key_words[7]),
+ set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+ set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
__m128i offset_low_vec, offset_high_vec;
load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
@@ -710,15 +723,12 @@ INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8],
}
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
+ const uint32_t key[8], uint64_t offset,
offset_deltas_t offset_deltas, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- uint32_t key_words[8];
- memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian
__m256i h_vecs[8] = {
- set1_256(key_words[0]), set1_256(key_words[1]), set1_256(key_words[2]),
- set1_256(key_words[3]), set1_256(key_words[4]), set1_256(key_words[5]),
- set1_256(key_words[6]), set1_256(key_words[7]),
+ set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
+ set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
};
__m256i offset_low_vec, offset_high_vec;
load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
@@ -1023,16 +1033,13 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16],
}
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
+ const uint32_t key[8], uint64_t offset,
offset_deltas_t offset_deltas, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out) {
- uint32_t key_words[8];
- memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian
__m512i h_vecs[8] = {
- set1_512(key_words[0]), set1_512(key_words[1]), set1_512(key_words[2]),
- set1_512(key_words[3]), set1_512(key_words[4]), set1_512(key_words[5]),
- set1_512(key_words[6]), set1_512(key_words[7]),
+ set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
+ set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
};
__m512i offset_low_vec, offset_high_vec;
load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
@@ -1107,20 +1114,18 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
*/
INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
+ const uint32_t key[8], uint64_t offset,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
- uint8_t cv[32];
+ uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
- uint8_t out[64];
- blake3_compress_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags,
- out);
- memcpy(cv, out, 32);
+ blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset,
+ block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
@@ -1129,7 +1134,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
}
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t offset_deltas,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h
index d2e0325..af55d93 100644
--- a/src/c/blake3_impl.h
+++ b/src/c/blake3_impl.h
@@ -32,12 +32,6 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
0x1F83D9ABUL, 0x5BE0CD19UL};
-static const uint8_t IV_BYTES[32] = {
- 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e,
- 0x3c, 0x3a, 0xf5, 0x4f, 0xa5, 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68,
- 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b,
-};
-
static const uint8_t MSG_SCHEDULE[7][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
@@ -81,41 +75,71 @@ INLINE uint32_t offset_high(uint64_t offset) {
return (uint32_t)(offset >> 32);
}
+INLINE uint32_t load32(const void *src) {
+ const uint8_t *p = (const uint8_t *)src;
+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
+}
+
+INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+ uint32_t key_words[8]) {
+ key_words[0] = load32(&key[0 * 4]);
+ key_words[1] = load32(&key[1 * 4]);
+ key_words[2] = load32(&key[2 * 4]);
+ key_words[3] = load32(&key[3 * 4]);
+ key_words[4] = load32(&key[4 * 4]);
+ key_words[5] = load32(&key[5 * 4]);
+ key_words[6] = load32(&key[6 * 4]);
+ key_words[7] = load32(&key[7 * 4]);
+}
+
// Declarations for implementation-specific functions.
-void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags,
- uint8_t out[64]);
-void blake3_compress_sse41(const uint8_t cv[BLAKE3_OUT_LEN],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags,
- uint8_t out[64]);
-void blake3_compress_avx512(const uint8_t cv[BLAKE3_OUT_LEN],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags,
- uint8_t out[64]);
+void blake3_compress_in_place_portable(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags);
+void blake3_compress_in_place_sse41(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags);
+void blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags);
+void blake3_compress_xof_portable(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags, uint8_t out[64]);
+void blake3_compress_xof_sse41(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags, uint8_t out[64]);
+void blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t offset,
+ uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t od,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t od, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out);
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t od, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t od, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out);
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t od, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out);
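load32() and load_key_words() make the byte-to-word conversion explicit and endian-portable, replacing the little-endian memcpy tricks removed elsewhere in this diff. They mirror words_from_le_bytes_32() in platform.rs; a rolled-up Rust equivalent for comparison:

use core::convert::TryInto;

// Each word is an explicit little-endian load, so the result is the same
// on any byte order (sketch; the real code unrolls this loop).
fn load_key_words(key: &[u8; 32]) -> [u32; 8] {
    let mut words = [0u32; 8];
    for (word, chunk) in words.iter_mut().zip(key.chunks_exact(4)) {
        *word = u32::from_le_bytes(chunk.try_into().unwrap());
    }
    words
}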
diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c
index 86a99bf..0ffa17e 100644
--- a/src/c/blake3_neon.c
+++ b/src/c/blake3_neon.c
@@ -223,17 +223,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
}
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
+ const uint32_t key[8], uint64_t offset,
offset_deltas_t offset_deltas, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
- uint32_t key_words[8];
- // TODO: This is assuming little-endian. Is there such a thing as NEON on
- // big-endian?
- memcpy(key_words, key, BLAKE3_KEY_LEN);
uint32x4_t h_vecs[8] = {
- set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]),
- set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]),
- set1_128(key_words[6]), set1_128(key_words[7]),
+ set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+ set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
uint32x4_t offset_low_vec, offset_high_vec;
load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
@@ -294,10 +289,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
*/
INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset,
- uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+ const uint32_t key[8], uint64_t offset, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end,
uint8_t out[BLAKE3_OUT_LEN]) {
- uint8_t cv[32];
+ uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
@@ -307,10 +302,8 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
// TODO: Implement compress_neon. However note that according to
// https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
// compress_neon might not be any faster than compress_portable.
- uint8_t out[64];
- blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags,
- out);
- memcpy(cv, out, 32);
+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset,
+ block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
@@ -319,7 +312,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
}
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
+ size_t blocks, const uint32_t key[8],
uint64_t offset, offset_deltas_t offset_deltas,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
diff --git a/src/c/blake3_portable.c b/src/c/blake3_portable.c
deleted file mode 100644
index 6b58cf4..0000000
--- a/src/c/blake3_portable.c
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "blake3_impl.h"
-#include <string.h>
-
-INLINE uint32_t load32(const void *src) {
- const uint8_t *p = (const uint8_t *)src;
- return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
- ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
-}
-
-INLINE void store32(void *dst, uint32_t w) {
- uint8_t *p = (uint8_t *)dst;
- p[0] = (uint8_t)(w >> 0);
- p[1] = (uint8_t)(w >> 8);
- p[2] = (uint8_t)(w >> 16);
- p[3] = (uint8_t)(w >> 24);
-}
-
-INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
- return (w >> c) | (w << (32 - c));
-}
-
-INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
- uint32_t x, uint32_t y) {
- state[a] = state[a] + state[b] + x;
- state[d] = rotr32(state[d] ^ state[a], 16);
- state[c] = state[c] + state[d];
- state[b] = rotr32(state[b] ^ state[c], 12);
- state[a] = state[a] + state[b] + y;
- state[d] = rotr32(state[d] ^ state[a], 8);
- state[c] = state[c] + state[d];
- state[b] = rotr32(state[b] ^ state[c], 7);
-}
-
-INLINE void round_fn(uint32_t *state, const uint32_t *msg, size_t round) {
- // Select the message schedule based on the round.
- const uint8_t *schedule = MSG_SCHEDULE[round];
-
- // Mix the columns.
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
- g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
- g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
- g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
-
- // Mix the rows.
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
- g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
- g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
- g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
-}
-
-void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN],
- const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags,
- uint8_t out[64]) {
- uint32_t block_words[16];
- block_words[0] = load32(block + 4 * 0);
- block_words[1] = load32(block + 4 * 1);
- block_words[2] = load32(block + 4 * 2);
- block_words[3] = load32(block + 4 * 3);
- block_words[4] = load32(block + 4 * 4);
- block_words[5] = load32(block + 4 * 5);
- block_words[6] = load32(block + 4 * 6);
- block_words[7] = load32(block + 4 * 7);
- block_words[8] = load32(block + 4 * 8);
- block_words[9] = load32(block + 4 * 9);
- block_words[10] = load32(block + 4 * 10);
- block_words[11] = load32(block + 4 * 11);
- block_words[12] = load32(block + 4 * 12);
- block_words[13] = load32(block + 4 * 13);
- block_words[14] = load32(block + 4 * 14);
- block_words[15] = load32(block + 4 * 15);
-
- uint32_t state[16] = {
- load32(&cv[0 * 4]),
- load32(&cv[1 * 4]),
- load32(&cv[2 * 4]),
- load32(&cv[3 * 4]),
- load32(&cv[4 * 4]),
- load32(&cv[5 * 4]),
- load32(&cv[6 * 4]),
- load32(&cv[7 * 4]),
- IV[0],
- IV[1],
- IV[2],
- IV[3],
- offset_low(offset),
- offset_high(offset),
- (uint32_t)block_len,
- (uint32_t)flags,
- };
-
- round_fn(&state[0], &block_words[0], 0);
- round_fn(&state[0], &block_words[0], 1);
- round_fn(&state[0], &block_words[0], 2);
- round_fn(&state[0], &block_words[0], 3);
- round_fn(&state[0], &block_words[0], 4);
- round_fn(&state[0], &block_words[0], 5);
- round_fn(&state[0], &block_words[0], 6);
-
- store32(&out[0 * 4], state[0] ^ state[8]);
- store32(&out[1 * 4], state[1] ^ state[9]);
- store32(&out[2 * 4], state[2] ^ state[10]);
- store32(&out[3 * 4], state[3] ^ state[11]);
- store32(&out[4 * 4], state[4] ^ state[12]);
- store32(&out[5 * 4], state[5] ^ state[13]);
- store32(&out[6 * 4], state[6] ^ state[14]);
- store32(&out[7 * 4], state[7] ^ state[15]);
- store32(&out[8 * 4], state[8] ^ cv[0]);
- store32(&out[9 * 4], state[9] ^ cv[1]);
- store32(&out[10 * 4], state[10] ^ cv[2]);
- store32(&out[11 * 4], state[11] ^ cv[3]);
- store32(&out[12 * 4], state[12] ^ cv[4]);
- store32(&out[13 * 4], state[13] ^ cv[5]);
- store32(&out[14 * 4], state[14] ^ cv[6]);
- store32(&out[15 * 4], state[15] ^ cv[7]);
-}
-
-INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
- const uint8_t key[BLAKE3_KEY_LEN],
- uint64_t offset, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
- uint8_t out[BLAKE3_OUT_LEN]) {
- uint8_t cv[32];
- memcpy(cv, key, 32);
- uint8_t block_flags = flags | flags_start;
- while (blocks > 0) {
- if (blocks == 1) {
- block_flags |= flags_end;
- }
- uint8_t out[64];
- blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags,
- out);
- memcpy(cv, out, 32);
- input = &input[BLAKE3_BLOCK_LEN];
- blocks -= 1;
- block_flags = flags;
- }
- memcpy(out, cv, 32);
-}
-
-void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
- size_t blocks, const uint8_t key[BLAKE3_KEY_LEN],
- uint64_t offset, offset_deltas_t offset_deltas,
- uint8_t flags, uint8_t flags_start,
- uint8_t flags_end, uint8_t *out) {
- while (num_inputs > 0) {
- hash_one_portable(inputs[0], blocks, key, offset, flags, flags_start,
- flags_end, out);
- inputs += 1;
- num_inputs -= 1;
- offset += offset_deltas[1];
- out = &out[BLAKE3_OUT_LEN];
- }
-}
diff --git a/src/c_avx512.rs b/src/c_avx512.rs
index 4ea2542..0120a2c 100644
--- a/src/c_avx512.rs
+++ b/src/c_avx512.rs
@@ -1,17 +1,29 @@
-use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN};
+use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
+use arrayref::array_ref;
pub const DEGREE: usize = 16;
// Unsafe because this may only be called on platforms supporting AVX-512.
-pub unsafe fn compress(
- cv: &[u8; 32],
+pub unsafe fn compress_in_place(
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) {
+ ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags)
+}
+
+// Unsafe because this may only be called on platforms supporting AVX-512.
+pub unsafe fn compress_xof(
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
flags: u8,
) -> [u8; 64] {
let mut out = [0u8; 64];
- ffi::blake3_compress_avx512(
+ ffi::blake3_compress_xof_avx512(
cv.as_ptr(),
block.as_ptr(),
block_len,
@@ -25,7 +37,7 @@ pub unsafe fn compress(
// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -53,53 +65,26 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
pub mod ffi {
extern "C" {
- pub fn blake3_compress_avx512(
- cv: *const u8,
+ pub fn blake3_compress_in_place_avx512(
+ cv: *mut u32,
block: *const u8,
block_len: u8,
offset: u64,
flags: u8,
- out: *mut u8,
- );
- // hash4/hash8/hash16 are exposed here for benchmarks.
- pub fn blake3_hash4_avx512(
- inputs: *const *const u8,
- blocks: usize,
- key: *const u8,
- offset: u64,
- offset_deltas: *const u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
- pub fn blake3_hash8_avx512(
- inputs: *const *const u8,
- blocks: usize,
- key: *const u8,
- offset: u64,
- offset_deltas: *const u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
);
- pub fn blake3_hash16_avx512(
- inputs: *const *const u8,
- blocks: usize,
- key: *const u8,
+ pub fn blake3_compress_xof_avx512(
+ cv: *const u32,
+ block: *const u8,
+ block_len: u8,
offset: u64,
- offset_deltas: *const u64,
flags: u8,
- flags_start: u8,
- flags_end: u8,
out: *mut u8,
);
pub fn blake3_hash_many_avx512(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
- key: *const u8,
+ key: *const u32,
offset: u64,
offset_deltas: *const u64,
flags: u8,
@@ -119,7 +104,7 @@ mod test {
if !crate::platform::avx512_detected() {
return;
}
- crate::test::test_compress_fn(compress);
+ crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
diff --git a/src/c_neon.rs b/src/c_neon.rs
index 15e76d2..029bf7e 100644
--- a/src/c_neon.rs
+++ b/src/c_neon.rs
@@ -1,11 +1,11 @@
-use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN};
+use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
pub const DEGREE: usize = 4;
// Unsafe because this may only be called on platforms supporting NEON.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -31,25 +31,36 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
)
}
+// blake3_neon.c normally depends on blake3_portable.c, because the NEON
+// implementation only provides 4x compression, and it relies on the portable
+// implementation for 1x compression. However, we expose the portable Rust
+// implementation here instead, to avoid linking in unnecessary code.
+#[no_mangle]
+pub extern "C" fn blake3_compress_in_place_portable(
+ cv: *mut u32,
+ block: *const u8,
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) {
+ unsafe {
+ crate::portable::compress_in_place(
+ &mut *(cv as *mut [u32; 8]),
+ &*(block as *const [u8; 64]),
+ block_len,
+ offset,
+ flags,
+ )
+ }
+}
+
pub mod ffi {
extern "C" {
- // Exposed here for benchmarks.
- pub fn blake3_hash4_neon(
- inputs: *const *const u8,
- blocks: usize,
- key: *const u8,
- offset: u64,
- offset_deltas: *const u64,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: *mut u8,
- );
pub fn blake3_hash_many_neon(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
- key: *const u8,
+ key: *const u32,
offset: u64,
offset_deltas: *const u64,
flags: u8,
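Because blake3_neon.c refers to the symbol blake3_compress_in_place_portable by name, the #[no_mangle] shim above satisfies that reference at link time without compiling blake3_portable.c at all. A sketch of a call with the same pointer types the C side passes (validity of the pointers is the caller's contract):

fn exercise_shim() {
    let mut cv = [0u32; 8];
    let block = [0u8; 64];
    // Flags and offset are zeroed here purely to exercise the FFI shape.
    blake3_compress_in_place_portable(cv.as_mut_ptr(), block.as_ptr(), 64, 0, 0);
}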
diff --git a/src/lib.rs b/src/lib.rs
index 9edcd78..8ccf7c5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -40,15 +40,18 @@ pub const BLOCK_LEN: usize = 64;
#[doc(hidden)]
pub const CHUNK_LEN: usize = 2048;
-const IV: &[u32; 8] = &[
+// While iterating the compression function within a chunk, the CV is
+// represented as words, to avoid doing two extra endianness conversions for
+// each compression in the portable implementation. But the hash_many interface
+// needs to hash both input bytes and parent nodes, so it's better for its
+// output CVs to be represented as bytes.
+type CVWords = [u32; 8];
+type CVBytes = [u8; 32]; // little-endian
+
+const IV: &CVWords = &[
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];
-const IV_BYTES: &[u8; 32] = &[
- 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e, 0x3c, 0x3a, 0xf5, 0x4f, 0xa5,
- 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68, 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b,
-];
-
const MSG_SCHEDULE: [[usize; 16]; 7] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
@@ -173,7 +176,7 @@ impl fmt::Debug for Hash {
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
struct Output {
- input_chaining_value: [u8; 32],
+ input_chaining_value: CVWords,
block: [u8; 64],
block_len: u8,
offset: u64,
@@ -182,34 +185,31 @@ struct Output {
}
impl Output {
- fn chaining_value(&self) -> [u8; 32] {
- let out = self.platform.compress(
- &self.input_chaining_value,
+ fn chaining_value(&self) -> CVBytes {
+ let mut cv = self.input_chaining_value;
+ self.platform.compress_in_place(
+ &mut cv,
&self.block,
self.block_len,
self.offset,
self.flags,
);
- *array_ref!(out, 0, 32)
+ platform::le_bytes_from_words_32(&cv)
}
fn root_hash(&self) -> Hash {
debug_assert_eq!(self.offset, 0);
- let out = self.platform.compress(
- &self.input_chaining_value,
- &self.block,
- self.block_len,
- 0,
- self.flags | ROOT,
- );
- Hash(*array_ref!(out, 0, 32))
+ let mut cv = self.input_chaining_value;
+ self.platform
+ .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
+ Hash(platform::le_bytes_from_words_32(&cv))
}
fn root_output_bytes(&self, out_slice: &mut [u8]) {
debug_assert_eq!(self.offset, 0);
let mut offset = 0;
for out_block in out_slice.chunks_mut(2 * OUT_LEN) {
- let out_bytes = self.platform.compress(
+ let out_bytes = self.platform.compress_xof(
&self.input_chaining_value,
&self.block,
self.block_len,
@@ -224,7 +224,7 @@ impl Output {
#[derive(Clone)]
struct ChunkState {
- cv: [u8; 32],
+ cv: CVWords,
offset: u64,
buf: [u8; BLOCK_LEN],
buf_len: u8,
@@ -234,7 +234,7 @@ struct ChunkState {
}
impl ChunkState {
- fn new(key: &[u8; 32], offset: u64, flags: u8, platform: Platform) -> Self {
+ fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self {
Self {
cv: *key,
offset,
@@ -246,7 +246,7 @@ impl ChunkState {
}
}
- fn reset(&mut self, key: &[u8; KEY_LEN], new_offset: u64) {
+ fn reset(&mut self, key: &CVWords, new_offset: u64) {
debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0);
self.cv = *key;
self.offset = new_offset;
@@ -283,14 +283,13 @@ impl ChunkState {
if !input.is_empty() {
debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
let block_flags = self.flags | self.start_flag(); // borrowck
- let output = self.platform.compress(
- &self.cv,
+ self.platform.compress_in_place(
+ &mut self.cv,
&self.buf,
BLOCK_LEN as u8,
self.offset,
block_flags,
);
- self.cv = *array_ref!(output, 0, 32);
self.buf_len = 0;
self.buf = [0; BLOCK_LEN];
self.blocks_compressed += 1;
@@ -300,14 +299,13 @@ impl ChunkState {
while input.len() > BLOCK_LEN {
debug_assert_eq!(self.buf_len, 0);
let block_flags = self.flags | self.start_flag(); // borrowck
- let output = self.platform.compress(
- &self.cv,
+ self.platform.compress_in_place(
+ &mut self.cv,
array_ref!(input, 0, BLOCK_LEN),
BLOCK_LEN as u8,
self.offset,
block_flags,
);
- self.cv = *array_ref!(output, 0, 32);
self.blocks_compressed += 1;
input = &input[BLOCK_LEN..];
}
@@ -400,7 +398,7 @@ where
// those cases use a different codepath.
fn compress_chunks_parallel(
input: &[u8],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
platform: Platform,
@@ -448,7 +446,7 @@ fn compress_chunks_parallel(
// never empty; those cases use a different codepath.
fn compress_parents_parallel(
child_chaining_values: &[u8],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
flags: u8,
platform: Platform,
out: &mut [u8],
@@ -501,7 +499,7 @@ fn compress_parents_parallel(
// codepath.
fn compress_subtree_wide(
input: &[u8],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
platform: Platform,
@@ -573,7 +571,7 @@ fn compress_subtree_wide(
// chunk or less. That's a different codepath.
fn compress_subtree_to_parent_node(
input: &[u8],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
platform: Platform,
@@ -597,7 +595,7 @@ fn compress_subtree_to_parent_node(
// Hash a complete input all at once. Unlike compress_subtree_wide() and
// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
-fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output {
+fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
let platform = Platform::detect();
// If the whole subtree is one chunk, hash it directly with a ChunkState.
@@ -621,23 +619,25 @@ fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output {
/// The default hash function.
pub fn hash(input: &[u8]) -> Hash {
- hash_all_at_once(input, IV_BYTES, 0).root_hash()
+ hash_all_at_once(input, IV, 0).root_hash()
}
/// The keyed hash function.
pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
- hash_all_at_once(input, key, KEYED_HASH).root_hash()
+ let key_words = platform::words_from_le_bytes_32(key);
+ hash_all_at_once(input, &key_words, KEYED_HASH).root_hash()
}
/// The key derivation function.
pub fn derive_key(key: &[u8; KEY_LEN], context: &[u8]) -> Hash {
- hash_all_at_once(context, key, DERIVE_KEY).root_hash()
+ let key_words = platform::words_from_le_bytes_32(key);
+ hash_all_at_once(context, &key_words, DERIVE_KEY).root_hash()
}
fn parent_node_output(
- left_child: &[u8; 32],
- right_child: &[u8; 32],
- key: &[u8; KEY_LEN],
+ left_child: &CVBytes,
+ right_child: &CVBytes,
+ key: &CVWords,
flags: u8,
platform: Platform,
) -> Output {
@@ -658,14 +658,14 @@ fn parent_node_output(
/// support for extendable output.
#[derive(Clone)]
pub struct Hasher {
- key: [u8; KEY_LEN],
+ key: CVWords,
chunk_state: ChunkState,
// 2^53 * 2048 = 2^64
- cv_stack: ArrayVec<[[u8; OUT_LEN]; 53]>,
+ cv_stack: ArrayVec<[CVBytes; 53]>,
}
impl Hasher {
- fn new_internal(key: &[u8; 32], flags: u8) -> Self {
+ fn new_internal(key: &CVWords, flags: u8) -> Self {
Self {
key: *key,
chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
@@ -675,12 +675,13 @@ impl Hasher {
/// Construct a new `Hasher` for the regular hash function.
pub fn new() -> Self {
- Self::new_internal(IV_BYTES, 0)
+ Self::new_internal(IV, 0)
}
/// Construct a new `Hasher` for the keyed hash function.
pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
- Self::new_internal(key, KEYED_HASH)
+ let key_words = platform::words_from_le_bytes_32(key);
+ Self::new_internal(&key_words, KEYED_HASH)
}
/// Construct a new `Hasher` for the key derivation function.
@@ -691,7 +692,8 @@ impl Hasher {
///
/// [`derive_key`]: fn.derive_key.html
pub fn new_derive_key(key: &[u8; KEY_LEN]) -> Self {
- Self::new_internal(key, DERIVE_KEY)
+ let key_words = platform::words_from_le_bytes_32(key);
+ Self::new_internal(&key_words, DERIVE_KEY)
}
/// The total number of input bytes so far.
@@ -705,19 +707,18 @@ impl Hasher {
while self.cv_stack.len() > post_merge_stack_len {
let right_child = self.cv_stack.pop().unwrap();
let left_child = self.cv_stack.pop().unwrap();
- let parent_cv = parent_node_output(
+ let parent_output = parent_node_output(
&left_child,
&right_child,
&self.key,
self.chunk_state.flags,
self.chunk_state.platform,
- )
- .chaining_value();
- self.cv_stack.push(parent_cv);
+ );
+ self.cv_stack.push(parent_output.chaining_value());
}
}
- fn push_cv(&mut self, new_cv: &[u8; 32], offset: u64) {
+ fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) {
// In reference_impl.rs, we merge the new CV with existing CVs from the
// stack before pushing it. We can do that because we know more input
// is coming, so we know none of the merges are root.
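At the public API boundary, keys are still bytes; this commit moves the byte-to-word conversion to construction time, so it happens once per hasher rather than once per block. A usage sketch, assuming the crate is consumed under its usual name blake3 and that KEY_LEN is exported as the signatures above suggest:

fn demo() -> blake3::Hash {
    let key = [0x42u8; 32]; // KEY_LEN bytes
    // Both entry points call words_from_le_bytes_32() once, then stay in
    // word form internally (see the hunks above).
    let _hasher = blake3::Hasher::new_keyed(&key);
    blake3::keyed_hash(&key, b"some input")
}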
diff --git a/src/platform.rs b/src/platform.rs
index 4621fc5..b42f7b9 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -1,4 +1,5 @@
-use crate::{portable, OffsetDeltas, BLOCK_LEN, KEY_LEN};
+use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN};
+use arrayref::{array_mut_ref, array_ref};
#[cfg(feature = "c_avx512")]
use crate::c_avx512;
@@ -91,27 +92,55 @@ impl Platform {
degree
}
- pub(crate) fn compress(
+ pub(crate) fn compress_in_place(
&self,
- cv: &[u8; 32],
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+ ) {
+ match self {
+ Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags),
+ // Safe because detect() checked for platform support.
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ Platform::SSE41 | Platform::AVX2 => unsafe {
+ sse41::compress_in_place(cv, block, block_len, offset, flags)
+ },
+ // Safe because detect() checked for platform support.
+ #[cfg(feature = "c_avx512")]
+ Platform::AVX512 => unsafe {
+ c_avx512::compress_in_place(cv, block, block_len, offset, flags)
+ },
+ // No NEON compress_in_place() implementation yet.
+ #[cfg(feature = "c_neon")]
+ Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags),
+ }
+ }
+
+ pub(crate) fn compress_xof(
+ &self,
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
flags: u8,
) -> [u8; 64] {
match self {
- Platform::Portable => portable::compress(cv, block, block_len, offset, flags),
+ Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
- sse41::compress(cv, block, block_len, offset, flags)
+ sse41::compress_xof(cv, block, block_len, offset, flags)
},
// Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")]
- Platform::AVX512 => unsafe { c_avx512::compress(cv, block, block_len, offset, flags) },
- // No NEON compress() implementation yet.
+ Platform::AVX512 => unsafe {
+ c_avx512::compress_xof(cv, block, block_len, offset, flags)
+ },
+ // No NEON compress_xof() implementation yet.
#[cfg(feature = "c_neon")]
- Platform::NEON => portable::compress(cv, block, block_len, offset, flags),
+ Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags),
}
}
@@ -128,7 +157,7 @@ impl Platform {
pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>(
&self,
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -262,3 +291,75 @@ pub fn sse41_detected() -> bool {
}
false
}
+
+#[inline(always)]
+pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
+ let mut out = [0; 8];
+ out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
+ out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
+ out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
+ out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
+ out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
+ out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
+ out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
+ out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
+ out
+}
+
+#[inline(always)]
+pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
+ let mut out = [0; 16];
+ out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
+ out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
+ out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
+ out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
+ out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
+ out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
+ out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
+ out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
+ out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
+ out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
+ out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
+ out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
+ out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
+ out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
+ out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
+ out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
+ out
+}
+
+#[inline(always)]
+pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
+ let mut out = [0; 32];
+ *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
+ *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
+ *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
+ *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
+ *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
+ *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
+ *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
+ *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
+ out
+}
+
+#[inline(always)]
+pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
+ let mut out = [0; 64];
+ *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
+ *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
+ *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
+ *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
+ *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
+ *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
+ *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
+ *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
+ *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
+ *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
+ *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
+ *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
+ *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
+ *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
+ *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
+ *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
+ out
+}
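A quick property check for the four helpers above: the words_from_le_bytes_* and le_bytes_from_words_* pairs are inverses, which is what lets a CV move freely between word and byte form. A sketch, assuming the helpers are in scope:

fn round_trips() {
    let bytes = [0xABu8; 32];
    assert_eq!(le_bytes_from_words_32(&words_from_le_bytes_32(&bytes)), bytes);
    let words: [u32; 8] = [1, 2, 3, 4, 5, 6, 7, 8];
    assert_eq!(words_from_le_bytes_32(&le_bytes_from_words_32(&words)), words);
}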
diff --git a/src/portable.rs b/src/portable.rs
index b07c46a..fa0e17d 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -1,4 +1,6 @@
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+ offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
use arrayref::{array_mut_ref, array_ref};
#[inline(always)]
@@ -31,41 +33,25 @@ fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
-pub fn compress(
- cv: &[u8; 32],
+#[inline(always)]
+fn compress_pre(
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
flags: u8,
-) -> [u8; 64] {
- let block_words = [
- u32::from_le_bytes(*array_ref!(block, 0 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 1 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 2 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 3 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 4 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 5 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 6 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 7 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 8 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 9 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 10 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 11 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 12 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 13 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 14 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 15 * 4, 4)),
- ];
+) -> [u32; 16] {
+ let block_words = crate::platform::words_from_le_bytes_64(block);
let mut state = [
- u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)),
+ cv[0],
+ cv[1],
+ cv[2],
+ cv[3],
+ cv[4],
+ cv[5],
+ cv[6],
+ cv[7],
IV[0],
IV[1],
IV[2],
@@ -84,6 +70,36 @@ pub fn compress(
round(&mut state, &block_words, 5);
round(&mut state, &block_words, 6);
+ state
+}
+
+pub fn compress_in_place(
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) {
+ let state = compress_pre(cv, block, block_len, offset, flags);
+
+ cv[0] = state[0] ^ state[8];
+ cv[1] = state[1] ^ state[9];
+ cv[2] = state[2] ^ state[10];
+ cv[3] = state[3] ^ state[11];
+ cv[4] = state[4] ^ state[12];
+ cv[5] = state[5] ^ state[13];
+ cv[6] = state[6] ^ state[14];
+ cv[7] = state[7] ^ state[15];
+}
+
+pub fn compress_xof(
+ cv: &CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) -> [u8; 64] {
+ let mut state = compress_pre(cv, block, block_len, offset, flags);
state[0] ^= state[8];
state[1] ^= state[9];
state[2] ^= state[10];
@@ -92,43 +108,25 @@ pub fn compress(
state[5] ^= state[13];
state[6] ^= state[14];
state[7] ^= state[15];
- state[8] ^= u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4));
- state[9] ^= u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4));
- state[10] ^= u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4));
- state[11] ^= u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4));
- state[12] ^= u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4));
- state[13] ^= u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4));
- state[14] ^= u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4));
- state[15] ^= u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4));
-
- let mut out = [0; 64];
- out[0 * 4..][..4].copy_from_slice(&state[0].to_le_bytes());
- out[1 * 4..][..4].copy_from_slice(&state[1].to_le_bytes());
- out[2 * 4..][..4].copy_from_slice(&state[2].to_le_bytes());
- out[3 * 4..][..4].copy_from_slice(&state[3].to_le_bytes());
- out[4 * 4..][..4].copy_from_slice(&state[4].to_le_bytes());
- out[5 * 4..][..4].copy_from_slice(&state[5].to_le_bytes());
- out[6 * 4..][..4].copy_from_slice(&state[6].to_le_bytes());
- out[7 * 4..][..4].copy_from_slice(&state[7].to_le_bytes());
- out[8 * 4..][..4].copy_from_slice(&state[8].to_le_bytes());
- out[9 * 4..][..4].copy_from_slice(&state[9].to_le_bytes());
- out[10 * 4..][..4].copy_from_slice(&state[10].to_le_bytes());
- out[11 * 4..][..4].copy_from_slice(&state[11].to_le_bytes());
- out[12 * 4..][..4].copy_from_slice(&state[12].to_le_bytes());
- out[13 * 4..][..4].copy_from_slice(&state[13].to_le_bytes());
- out[14 * 4..][..4].copy_from_slice(&state[14].to_le_bytes());
- out[15 * 4..][..4].copy_from_slice(&state[15].to_le_bytes());
- out
+ state[8] ^= cv[0];
+ state[9] ^= cv[1];
+ state[10] ^= cv[2];
+ state[11] ^= cv[3];
+ state[12] ^= cv[4];
+ state[13] ^= cv[5];
+ state[14] ^= cv[6];
+ state[15] ^= cv[7];
+ crate::platform::le_bytes_from_words_64(&state)
}
pub fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
- out: &mut [u8; OUT_LEN],
+ out: &mut CVBytes,
) {
debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
@@ -138,23 +136,22 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
- let output = compress(
- &cv,
+ compress_in_place(
+ &mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
offset,
block_flags,
);
- cv = *array_ref!(output, 0, 32);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
- *out = cv;
+ *out = crate::platform::le_bytes_from_words_32(&cv);
}
pub fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
mut offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -182,12 +179,12 @@ pub mod test {
use super::*;
// This is basically testing the portable implementation against itself,
- // but we do it anyway for completeness. Other implementations will test
- // themselves against portable. We also have several tests against the
- // reference implementation in test.rs.
+ // but it also checks that compress_in_place and compress_xof are
+ // consistent. And there are tests against the reference implementation and
+ // against hardcoded test vectors elsewhere.
#[test]
fn test_compress() {
- crate::test::test_compress_fn(compress);
+ crate::test::test_compress_fn(compress_in_place, compress_xof);
}
// Ditto.
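For context on the split above: compress_in_place and compress_xof share the compress_pre core. The in-place variant keeps only the lower half of the feed-forward and writes it back into the CV, while the XOF variant also folds the input CV into the upper eight words and serializes all sixteen. A minimal sketch of the invariant this creates, mirroring what test_compress_fn exercises (the standalone helper here is illustrative, not part of this commit):

    use crate::{CVWords, BLOCK_LEN};

    // The first 32 bytes of compress_xof's output must equal the little-endian
    // serialization of the CV that compress_in_place leaves behind.
    fn check_portable_consistency(
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        offset: u64,
        flags: u8,
    ) {
        let mut updated_cv = *cv;
        crate::portable::compress_in_place(&mut updated_cv, block, block_len, offset, flags);
        let xof_out = crate::portable::compress_xof(cv, block, block_len, offset, flags);
        let cv_bytes = crate::platform::le_bytes_from_words_32(&updated_cv);
        assert_eq!(&xof_out[..32], &cv_bytes[..]);
    }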
diff --git a/src/sse41.rs b/src/sse41.rs
index a95d8a2..e45b22e 100644
--- a/src/sse41.rs
+++ b/src/sse41.rs
@@ -3,7 +3,9 @@ use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+ offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
pub const DEGREE: usize = 4;
@@ -122,16 +124,16 @@ unsafe fn undiagonalize(row1: &mut __m128i, row3: &mut __m128i, row4: &mut __m12
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(2, 1, 0, 3));
}
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn compress(
- cv: &[u8; 32],
+#[inline(always)]
+unsafe fn compress_pre(
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
flags: u8,
-) -> [u8; 64] {
- let row1 = &mut loadu(&cv[0]);
- let row2 = &mut loadu(&cv[16]);
+) -> [__m128i; 4] {
+ let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8);
+ let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8);
let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row4 = &mut set4(
offset_low(offset),
@@ -303,12 +305,37 @@ pub unsafe fn compress(
g2(row1, row2, row3, row4, buf);
undiagonalize(row1, row3, row4);
- *row1 = xor(*row1, *row3);
- *row2 = xor(*row2, *row4);
- *row3 = xor(*row3, loadu(&cv[0]));
- *row4 = xor(*row4, loadu(&cv[16]));
+ [*row1, *row2, *row3, *row4]
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_in_place(
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) {
+ let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags);
+ storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8);
+ storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8);
+}
- core::mem::transmute([*row1, *row2, *row3, *row4]) // x86 is little-endian
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_xof(
+ cv: &CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) -> [u8; 64] {
+ let [mut row1, mut row2, mut row3, mut row4] =
+ compress_pre(cv, block, block_len, offset, flags);
+ row1 = xor(row1, row3);
+ row2 = xor(row2, row4);
+ row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8));
+ row4 = xor(row4, loadu(cv.as_ptr().add(4) as *const u8));
+ core::mem::transmute([row1, row2, row3, row4])
}
#[inline(always)]
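The transmute at the tail of compress_xof reinterprets the four 16-byte __m128i rows as the 64-byte XOF output, which is byte-order-correct only because x86 is little-endian, as the removed comment noted. An equivalent formulation with explicit stores, assuming storeu wraps _mm_storeu_si128 as its other uses in this hunk suggest (a sketch, not part of this commit):

    // Hypothetical rewrite of compress_xof's final expression, after the xors.
    // This would sit inside the existing unsafe fn, so the raw pointer
    // arithmetic is permitted.
    let mut out = [0u8; 64];
    storeu(row1, out.as_mut_ptr().add(0));
    storeu(row2, out.as_mut_ptr().add(16));
    storeu(row3, out.as_mut_ptr().add(32));
    storeu(row4, out.as_mut_ptr().add(48));
    out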
@@ -500,7 +527,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, _
pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -508,16 +535,15 @@ pub unsafe fn hash4(
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
- let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian
let mut h_vecs = [
- set1(key_words[0]),
- set1(key_words[1]),
- set1(key_words[2]),
- set1(key_words[3]),
- set1(key_words[4]),
- set1(key_words[5]),
- set1(key_words[6]),
- set1(key_words[7]),
+ set1(key[0]),
+ set1(key[1]),
+ set1(key[2]),
+ set1(key[3]),
+ set1(key[4]),
+ set1(key[5]),
+ set1(key[6]),
+ set1(key[7]),
];
let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
let mut block_flags = flags | flags_start;
@@ -589,12 +615,12 @@ pub unsafe fn hash4(
#[target_feature(enable = "sse4.1")]
unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
- out: &mut [u8; OUT_LEN],
+ out: &mut CVBytes,
) {
debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
@@ -604,24 +630,23 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
- let out = compress(
- &cv,
+ compress_in_place(
+ &mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
offset,
block_flags,
);
- cv = *array_ref!(out, 0, 32);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
- *out = cv;
+ *out = core::mem::transmute(cv); // x86 is little-endian
}
#[target_feature(enable = "sse4.1")]
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
mut inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
mut offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -705,7 +730,7 @@ mod test {
if !crate::platform::sse41_detected() {
return;
}
- crate::test::test_compress_fn(compress);
+ crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
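Both files now lean on byte/word conversion helpers from platform.rs (le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32). platform.rs isn't shown in this excerpt, so the following is only a plausible shape for the 32-byte pair, inferred from their call sites in this diff:

    use crate::{CVBytes, CVWords};

    // Inferred sketch: serialize eight CV words as little-endian bytes.
    pub fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
        let mut out = [0u8; 32];
        for (chunk, word) in out.chunks_exact_mut(4).zip(words.iter()) {
            chunk.copy_from_slice(&word.to_le_bytes());
        }
        out
    }

    // Inferred sketch: the inverse, used by the key-equality test below.
    pub fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
        let mut words = [0u32; 8];
        for (word, chunk) in words.iter_mut().zip(bytes.chunks_exact(4)) {
            *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
        }
        words
    }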
diff --git a/src/test.rs b/src/test.rs
index 9d710d2..e5ee4e2 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,4 +1,4 @@
-use crate::{OffsetDeltas, BLOCK_LEN, CHUNK_LEN, KEY_LEN, OUT_LEN};
+use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use arrayref::array_ref;
use arrayvec::ArrayVec;
use core::usize;
@@ -31,7 +31,11 @@ pub const TEST_CASES: &[usize] = &[
pub const TEST_CASES_MAX: usize = 31 * CHUNK_LEN;
-pub const TEST_KEY: [u8; crate::KEY_LEN] = *b"whats the Elvish word for friend";
+// There's a test below that checks these two are equal.
+pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: CVWords = [
+ 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
+];
// Paint the input with a repeating byte pattern. We use a cycle length of 251,
// because that's the largest prime number less than 256. This makes it
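Concretely, each entry of TEST_KEY_WORDS above is four bytes of TEST_KEY read as a little-endian u32. For example, the first word comes from the bytes "what":

    // 'w'=0x77, 'h'=0x68, 'a'=0x61, 't'=0x74 read little-endian is 0x74616877.
    assert_eq!(u32::from_le_bytes(*b"what"), 1952540791);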
@@ -43,8 +47,11 @@ pub fn paint_test_input(buf: &mut [u8]) {
}
}
-type CompressFn = unsafe fn(
- cv: &[u8; 32],
+type CompressInPlaceFn =
+ unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8);
+
+type CompressXofFn = unsafe fn(
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
@@ -52,8 +59,8 @@ type CompressFn = unsafe fn(
) -> [u8; 64];
// A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_fn: CompressFn) {
- let initial_state = *b"IV for compression tests <('.')>";
+pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
+ let initial_state = TEST_KEY_WORDS;
let block_len: u8 = 61;
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len as usize]);
@@ -62,16 +69,21 @@ pub fn test_compress_fn(compress_fn: CompressFn) {
let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
let portable_out =
- crate::portable::compress(&initial_state, &block, block_len, offset as u64, flags);
+ crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags);
- let test_out = unsafe { compress_fn(&initial_state, &block, block_len, offset as u64, flags) };
+ let mut test_state = initial_state;
+ unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) };
+ let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
+ let test_xof =
+ unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) };
- assert_eq!(&portable_out[..], &test_out[..]);
+ assert_eq!(&portable_out[..32], &test_state_bytes[..]);
+ assert_eq!(&portable_out[..], &test_xof[..]);
}
type HashManyFn<A> = unsafe fn(
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -100,7 +112,7 @@ pub fn test_hash_many_fn(
let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
crate::portable::hash_many(
&chunks,
- &TEST_KEY,
+ &TEST_KEY_WORDS,
offset,
crate::CHUNK_OFFSET_DELTAS,
crate::DERIVE_KEY,
@@ -113,7 +125,7 @@ pub fn test_hash_many_fn(
unsafe {
hash_many_chunks_fn(
&chunks[..],
- &TEST_KEY,
+ &TEST_KEY_WORDS,
offset,
crate::CHUNK_OFFSET_DELTAS,
crate::DERIVE_KEY,
@@ -139,7 +151,7 @@ pub fn test_hash_many_fn(
let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
crate::portable::hash_many(
&parents,
- &TEST_KEY,
+ &TEST_KEY_WORDS,
0,
crate::PARENT_OFFSET_DELTAS,
crate::DERIVE_KEY | crate::PARENT,
@@ -152,7 +164,7 @@ pub fn test_hash_many_fn(
unsafe {
hash_many_parents_fn(
&parents[..],
- &TEST_KEY,
+ &TEST_KEY_WORDS,
0,
crate::PARENT_OFFSET_DELTAS,
crate::DERIVE_KEY | crate::PARENT,
@@ -172,6 +184,14 @@ pub fn test_hash_many_fn(
}
#[test]
+fn test_key_bytes_equal_key_words() {
+ assert_eq!(
+ TEST_KEY_WORDS,
+ crate::platform::words_from_le_bytes_32(&TEST_KEY),
+ );
+}
+
+#[test]
fn test_reference_impl_size() {
// Because the Rust compiler optimizes struct layout, it's possible that
// some future version of the compiler will produce a different size. If