author    Jack O'Connor <[email protected]>  2019-12-12 18:21:17 -0500
committer Jack O'Connor <[email protected]>  2019-12-12 21:41:30 -0500
commit    b5f1e925f79589ade494289d730c1fb5c25a09b1 (patch)
tree      a8608132b339b9f2c55bec46bf18d297cd0216b6 /src
parent    a5cc3b28679d724020b8810ea9aecea9f54a058f (diff)
rename "offset" to "counter" and always increment it by 1
This is simpler than sometimes incrementing by CHUNK_LEN and other times incrementing by BLOCK_LEN.
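In sketch form, the new hash_many() convention looks like this (not code from this diff; `hash_one` is a hypothetical stand-in for the real per-input compression loop, and the key/flag arguments are omitted):

```rust
// Scalar sketch of the new hash_many() convention.
#[derive(Clone, Copy)]
enum IncrementCounter {
    Yes, // chunks: inputs use counter, counter + 1, counter + 2, ...
    No,  // parent nodes: every input uses the same counter (always 0)
}

fn hash_one(_input: &[u8], _counter: u64) {
    // compress all the blocks of one input with the given counter
}

fn hash_many(inputs: &[&[u8]], mut counter: u64, increment_counter: IncrementCounter) {
    for input in inputs {
        hash_one(input, counter);
        if let IncrementCounter::Yes = increment_counter {
            counter += 1; // before: offset += CHUNK_LEN for chunks, += 0 for parents
        }
    }
}

fn main() {
    let inputs: [&[u8]; 3] = [b"a", b"b", b"c"];
    hash_many(&inputs, 0, IncrementCounter::Yes); // inputs get counters 0, 1, 2
    hash_many(&inputs, 0, IncrementCounter::No); // every input gets counter 0
}
```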
Diffstat (limited to 'src')
-rw-r--r--  src/avx2.rs           |  65
-rw-r--r--  src/c/blake3.h        |   4
-rw-r--r--  src/c/blake3_avx512.c | 143
-rw-r--r--  src/c/blake3_impl.h   |  57
-rw-r--r--  src/c/blake3_neon.c   |  56
-rw-r--r--  src/c_avx512.rs       |  27
-rw-r--r--  src/c_neon.rs         |  18
-rw-r--r--  src/lib.rs            | 180
-rw-r--r--  src/platform.rs       |  46
-rw-r--r--  src/portable.rs       |  31
-rw-r--r--  src/sse41.rs          |  68
-rw-r--r--  src/test.rs           |  44
12 files changed, 376 insertions, 363 deletions
diff --git a/src/avx2.rs b/src/avx2.rs
index 471a2dc..3424a83 100644
--- a/src/avx2.rs
+++ b/src/avx2.rs
@@ -3,7 +3,9 @@ use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
-use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+ counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
use arrayref::{array_mut_ref, mut_array_refs};
pub const DEGREE: usize = 8;
@@ -270,27 +272,28 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
}
#[inline(always)]
-unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, __m256i) {
+unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) {
+ let mask = if increment_counter.yes() { !0 } else { 0 };
(
set8(
- offset_low(offset + offset_deltas[0]),
- offset_low(offset + offset_deltas[1]),
- offset_low(offset + offset_deltas[2]),
- offset_low(offset + offset_deltas[3]),
- offset_low(offset + offset_deltas[4]),
- offset_low(offset + offset_deltas[5]),
- offset_low(offset + offset_deltas[6]),
- offset_low(offset + offset_deltas[7]),
+ counter_low(counter + (mask & 0)),
+ counter_low(counter + (mask & 1)),
+ counter_low(counter + (mask & 2)),
+ counter_low(counter + (mask & 3)),
+ counter_low(counter + (mask & 4)),
+ counter_low(counter + (mask & 5)),
+ counter_low(counter + (mask & 6)),
+ counter_low(counter + (mask & 7)),
),
set8(
- offset_high(offset + offset_deltas[0]),
- offset_high(offset + offset_deltas[1]),
- offset_high(offset + offset_deltas[2]),
- offset_high(offset + offset_deltas[3]),
- offset_high(offset + offset_deltas[4]),
- offset_high(offset + offset_deltas[5]),
- offset_high(offset + offset_deltas[6]),
- offset_high(offset + offset_deltas[7]),
+ counter_high(counter + (mask & 0)),
+ counter_high(counter + (mask & 1)),
+ counter_high(counter + (mask & 2)),
+ counter_high(counter + (mask & 3)),
+ counter_high(counter + (mask & 4)),
+ counter_high(counter + (mask & 5)),
+ counter_high(counter + (mask & 6)),
+ counter_high(counter + (mask & 7)),
),
)
}
@@ -300,8 +303,8 @@ pub unsafe fn hash8(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -317,7 +320,7 @@ pub unsafe fn hash8(
set1(key[6]),
set1(key[7]),
];
- let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
+ let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
@@ -345,8 +348,8 @@ pub unsafe fn hash8(
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
- offset_low_vec,
- offset_high_vec,
+ counter_low_vec,
+ counter_high_vec,
block_len_vec,
block_flags_vec,
];
@@ -384,8 +387,8 @@ pub unsafe fn hash8(
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
mut inputs: &[&A],
key: &CVWords,
- mut offset: u64,
- offset_deltas: &OffsetDeltas,
+ mut counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -401,22 +404,24 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
input_ptrs,
blocks,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
+ if increment_counter.yes() {
+ counter += DEGREE as u64;
+ }
inputs = &inputs[DEGREE..];
- offset += DEGREE as u64 * offset_deltas[1];
out = &mut out[DEGREE * OUT_LEN..];
}
crate::sse41::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
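The load_counters() change above is the core trick: instead of adding entries from a per-lane delta table, a branch-free mask selects between counter + i and counter for lane i. A standalone scalar demo of the same trick (not the SIMD code itself):

```rust
// Branch-free lane counters, mirroring load_counters() above: with mask
// all-ones, lane i gets counter + i; with mask zero, every lane gets counter.
fn lane_counters(counter: u64, increment_counter: bool) -> [u64; 8] {
    let mask: u64 = if increment_counter { !0 } else { 0 };
    let mut lanes = [0u64; 8];
    for i in 0..8u64 {
        lanes[i as usize] = counter + (mask & i);
    }
    lanes
}

fn main() {
    assert_eq!(
        lane_counters(100, true),
        [100, 101, 102, 103, 104, 105, 106, 107]
    );
    assert_eq!(lane_counters(100, false), [100; 8]);
}
```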
diff --git a/src/c/blake3.h b/src/c/blake3.h
index 4d4a409..4c5624c 100644
--- a/src/c/blake3.h
+++ b/src/c/blake3.h
@@ -11,10 +11,10 @@
typedef struct {
uint32_t cv[8];
- uint64_t offset;
- uint16_t count;
+ uint64_t chunk_counter;
uint8_t buf[BLAKE3_BLOCK_LEN];
uint8_t buf_len;
+ uint8_t blocks_compressed;
uint8_t flags;
} blake3_chunk_state;
diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c
index f30e302..2c8657c 100644
--- a/src/c/blake3_avx512.c
+++ b/src/c/blake3_avx512.c
@@ -111,12 +111,12 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset, uint8_t flags) {
+ uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu_128((uint8_t *)&cv[0]);
rows[1] = loadu_128((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
- rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len,
- (uint32_t)flags);
+ rows[3] = set4(counter_low(counter), counter_high(counter),
+ (uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
@@ -281,10 +281,10 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
- compress_pre(rows, cv, block, block_len, offset, flags);
+ compress_pre(rows, cv, block, block_len, counter, flags);
storeu_128(xor_128(rows[0], rows[2]), &out[0]);
storeu_128(xor_128(rows[1], rows[3]), &out[16]);
storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
@@ -293,10 +293,10 @@ void blake3_compress_xof_avx512(const uint32_t cv[8],
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
- compress_pre(rows, cv, block, block_len, offset, flags);
+ compress_pre(rows, cv, block, block_len, counter, flags);
storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
@@ -468,24 +468,29 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
transpose_vecs_128(&out[12]);
}
-INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
- __m128i *out_lo, __m128i *out_hi) {
- __m256i a = _mm256_add_epi64(_mm256_set1_epi64x((int64_t)offset),
- _mm256_loadu_si256((const __m256i *)deltas));
- *out_lo = _mm256_cvtepi64_epi32(a);
- *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(a, 32));
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+ __m128i *out_lo, __m128i *out_hi) {
+ uint64_t mask = (increment_counter ? ~0 : 0);
+ __m256i mask_vec = _mm256_set1_epi64x(mask);
+ __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
+ deltas = _mm256_and_si256(mask_vec, deltas);
+ __m256i counters =
+ _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
+ *out_lo = _mm256_cvtepi64_epi32(counters);
+ *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
}
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t offset,
- offset_deltas_t offset_deltas, uint8_t flags,
+ const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
- __m128i offset_low_vec, offset_high_vec;
- load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+ __m128i counter_low_vec, counter_high_vec;
+ load_counters4(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
@@ -498,10 +503,10 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
- offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec,
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn4(v, msg_vecs, 0);
round_fn4(v, msg_vecs, 1);
@@ -714,24 +719,29 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
transpose_vecs_256(&out[8]);
}
-INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8],
- __m256i *out_lo, __m256i *out_hi) {
- __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
- _mm512_loadu_si512((const __m512i *)deltas));
- *out_lo = _mm512_cvtepi64_epi32(a);
- *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(a, 32));
+INLINE void load_counters8(uint64_t counter, bool increment_counter,
+ __m256i *out_lo, __m256i *out_hi) {
+ uint64_t mask = (increment_counter ? ~0 : 0);
+ __m512i mask_vec = _mm512_set1_epi64(mask);
+ __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ deltas = _mm512_and_si512(mask_vec, deltas);
+ __m512i counters =
+ _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
+ *out_lo = _mm512_cvtepi64_epi32(counters);
+ *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
}
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t offset,
- offset_deltas_t offset_deltas, uint8_t flags,
+ const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m256i h_vecs[8] = {
set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
};
- __m256i offset_low_vec, offset_high_vec;
- load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+ __m256i counter_low_vec, counter_high_vec;
+ load_counters8(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
@@ -744,10 +754,10 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m256i v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
- offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec,
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn8(v, msg_vecs, 0);
round_fn8(v, msg_vecs, 1);
@@ -1018,12 +1028,16 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
transpose_vecs_512(out);
}
-INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16],
- __m512i *out_lo, __m512i *out_hi) {
- __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
- _mm512_loadu_si512((const __m512i *)&deltas[0]));
- __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
- _mm512_loadu_si512((const __m512i *)&deltas[8]));
+INLINE void load_counters16(uint64_t counter, bool increment_counter,
+ __m512i *out_lo, __m512i *out_hi) {
+ uint64_t mask = (increment_counter ? ~0 : 0);
+ __m512i mask_vec = _mm512_set1_epi64(mask);
+ __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ deltas_a = _mm512_and_si512(mask_vec, deltas_a);
+ __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+ deltas_b = _mm512_and_si512(mask_vec, deltas_b);
+ __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a);
+ __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b);
__m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
22, 24, 26, 28, 30);
__m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
@@ -1033,16 +1047,17 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16],
}
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t offset,
- offset_deltas_t offset_deltas, uint8_t flags,
+ const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end,
uint8_t *out) {
__m512i h_vecs[8] = {
set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
};
- __m512i offset_low_vec, offset_high_vec;
- load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+ __m512i counter_low_vec, counter_high_vec;
+ load_counters16(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
@@ -1055,10 +1070,10 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m512i v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
- offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec,
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn16(v, msg_vecs, 0);
round_fn16(v, msg_vecs, 1);
@@ -1114,7 +1129,7 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
*/
INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t offset,
+ const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
@@ -1124,7 +1139,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
if (blocks == 1) {
block_flags |= flags_end;
}
- blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset,
+ blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
@@ -1135,39 +1150,47 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t offset_deltas,
+ uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= 16) {
- blake3_hash16_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+ blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 16;
+ }
inputs += 16;
num_inputs -= 16;
- offset += offset_deltas[16];
out = &out[16 * BLAKE3_OUT_LEN];
}
while (num_inputs >= 8) {
- blake3_hash8_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+ blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 8;
+ }
inputs += 8;
num_inputs -= 8;
- offset += offset_deltas[8];
out = &out[8 * BLAKE3_OUT_LEN];
}
while (num_inputs >= 4) {
- blake3_hash4_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+ blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 4;
+ }
inputs += 4;
num_inputs -= 4;
- offset += offset_deltas[4];
out = &out[4 * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
- hash_one_avx512(inputs[0], blocks, key, offset, flags, flags_start,
+ hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
+ if (increment_counter) {
+ counter += 1;
+ }
inputs += 1;
num_inputs -= 1;
- offset += offset_deltas[1];
out = &out[BLAKE3_OUT_LEN];
}
}
diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h
index af55d93..9a44391 100644
--- a/src/c/blake3_impl.h
+++ b/src/c/blake3_impl.h
@@ -42,19 +42,6 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
};
-// 17 is 1 + the largest supported SIMD degree. Each hash_many() implementation
-// can thus do `offset += offset_deltas[DEGREE]` at the end of each batch.
-typedef const uint64_t offset_deltas_t[17];
-
-static offset_deltas_t CHUNK_OFFSET_DELTAS = {
- BLAKE3_CHUNK_LEN * 0, BLAKE3_CHUNK_LEN * 1, BLAKE3_CHUNK_LEN * 2,
- BLAKE3_CHUNK_LEN * 3, BLAKE3_CHUNK_LEN * 4, BLAKE3_CHUNK_LEN * 5,
- BLAKE3_CHUNK_LEN * 6, BLAKE3_CHUNK_LEN * 7, BLAKE3_CHUNK_LEN * 8,
- BLAKE3_CHUNK_LEN * 9, BLAKE3_CHUNK_LEN * 10, BLAKE3_CHUNK_LEN * 11,
- BLAKE3_CHUNK_LEN * 12, BLAKE3_CHUNK_LEN * 13, BLAKE3_CHUNK_LEN * 14,
- BLAKE3_CHUNK_LEN * 15, BLAKE3_CHUNK_LEN * 16,
-};
-
// Count the number of 1 bits.
INLINE uint8_t popcnt(uint64_t x) {
#if __POPCNT__
@@ -69,10 +56,10 @@ INLINE uint8_t popcnt(uint64_t x) {
#endif
}
-INLINE uint32_t offset_low(uint64_t offset) { return (uint32_t)offset; }
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
-INLINE uint32_t offset_high(uint64_t offset) {
- return (uint32_t)(offset >> 32);
+INLINE uint32_t counter_high(uint64_t counter) {
+ return (uint32_t)(counter >> 32);
}
INLINE uint32_t load32(const void *src) {
@@ -96,50 +83,50 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
// Declarations for implementation-specific functions.
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
- uint8_t block_len, uint64_t offset,
+ uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t od,
+ uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t od, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
- uint8_t *out);
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t od, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
- uint8_t *out);
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t od, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
- uint8_t *out);
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t od, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
- uint8_t *out);
+ uint64_t counter, bool increment_counter,
+ uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out);
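For context, the 64-bit counter occupies two 32-bit words of the compression state (v[12] low, v[13] high), split by the counter_low/counter_high helpers above. A quick standalone check of that split:

```rust
// These mirror the counter_low/counter_high helpers in blake3_impl.h
// and src/lib.rs.
fn counter_low(counter: u64) -> u32 {
    counter as u32
}

fn counter_high(counter: u64) -> u32 {
    (counter >> 32) as u32
}

fn main() {
    let counter = (5u64 << 32) + 6; // set bits in both halves, as in src/test.rs
    assert_eq!(counter_low(counter), 6);
    assert_eq!(counter_high(counter), 5);
}
```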
diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c
index 0ffa17e..7335c19 100644
--- a/src/c/blake3_neon.c
+++ b/src/c/blake3_neon.c
@@ -212,26 +212,28 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
transpose_vecs_128(&out[12]);
}
-INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
- uint32x4_t *out_lo, uint32x4_t *out_hi) {
- *out_lo =
- set4(offset_low(offset + deltas[0]), offset_low(offset + deltas[1]),
- offset_low(offset + deltas[2]), offset_low(offset + deltas[3]));
- *out_hi =
- set4(offset_high(offset + deltas[0]), offset_high(offset + deltas[1]),
- offset_high(offset + deltas[2]), offset_high(offset + deltas[3]));
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+ uint32x4_t *out_low, uint32x4_t *out_high) {
+ uint64_t mask = (increment_counter ? ~0 : 0);
+ *out_low = set4(
+ counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+ counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+ *out_high = set4(
+ counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+ counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
- const uint32_t key[8], uint64_t offset,
- offset_deltas_t offset_deltas, uint8_t flags,
+ const uint32_t key[8], uint64_t counter,
+ bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
uint32x4_t h_vecs[8] = {
set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
- uint32x4_t offset_low_vec, offset_high_vec;
- load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+ uint32x4_t counter_low_vec, counter_high_vec;
+ load_counters4(counter, increment_counter, &counter_low_vec,
+ &counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
@@ -244,10 +246,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
uint32x4_t v[16] = {
- h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
- h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
- set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
- offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec,
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+ set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn4(v, msg_vecs, 0);
round_fn4(v, msg_vecs, 1);
@@ -289,8 +291,8 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
*/
INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
- const uint32_t key[8], uint64_t offset, uint8_t flags,
- uint8_t flags_start, uint8_t flags_end,
+ const uint32_t key[8], uint64_t counter,
+ uint8_t flags, uint8_t flags_start, uint8_t flags_end,
uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
@@ -302,7 +304,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
// TODO: Implement compress_neon. However note that according to
// https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
// compress_neon might not be any faster than compress_portable.
- blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset,
+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
@@ -313,23 +315,27 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
- uint64_t offset, offset_deltas_t offset_deltas,
+ uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= 4) {
- blake3_hash4_neon(inputs, blocks, key, offset, offset_deltas, flags,
+ blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 4;
+ }
inputs += 4;
num_inputs -= 4;
- offset += offset_deltas[4];
out = &out[4 * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
- hash_one_neon(inputs[0], blocks, key, offset, flags, flags_start, flags_end,
- out);
+ hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+ flags_end, out);
+ if (increment_counter) {
+ counter += 1;
+ }
inputs += 1;
num_inputs -= 1;
- offset += offset_deltas[1];
out = &out[BLAKE3_OUT_LEN];
}
}
diff --git a/src/c_avx512.rs b/src/c_avx512.rs
index e66f102..f20de2c 100644
--- a/src/c_avx512.rs
+++ b/src/c_avx512.rs
@@ -1,4 +1,4 @@
-use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
pub const DEGREE: usize = 16;
@@ -7,10 +7,10 @@ pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) {
- ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags)
+ ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
}
// Unsafe because this may only be called on platforms supporting AVX-512.
@@ -18,7 +18,7 @@ pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u8; 64] {
let mut out = [0u8; 64];
@@ -26,7 +26,7 @@ pub unsafe fn compress_xof(
cv.as_ptr(),
block.as_ptr(),
block_len,
- offset,
+ counter,
flags,
out.as_mut_ptr(),
);
@@ -37,8 +37,8 @@ pub unsafe fn compress_xof(
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -53,8 +53,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs.len(),
A::CAPACITY / BLOCK_LEN,
key.as_ptr(),
- offset,
- offset_deltas.as_ptr(),
+ counter,
+ increment_counter.yes(),
flags,
flags_start,
flags_end,
@@ -68,14 +68,14 @@ pub mod ffi {
cv: *mut u32,
block: *const u8,
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_avx512(
cv: *const u32,
block: *const u8,
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
out: *mut u8,
);
@@ -84,13 +84,14 @@ pub mod ffi {
num_inputs: usize,
blocks: usize,
key: *const u32,
- offset: u64,
- offset_deltas: *const u64,
+ counter: u64,
+ increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
+
}
}
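At the FFI boundary, the Rust IncrementCounter enum (defined in src/lib.rs below) is lowered to the C bool parameter via its yes() method. A self-contained sketch of that lowering, with the extern function mocked:

```rust
// Sketch only: `blake3_hash_many_c` is a mock stand-in for the real
// extern "C" blake3_hash_many_avx512 declared above.
#[derive(Clone, Copy)]
enum IncrementCounter {
    Yes,
    No,
}

impl IncrementCounter {
    fn yes(&self) -> bool {
        match self {
            IncrementCounter::Yes => true,
            IncrementCounter::No => false,
        }
    }
}

fn blake3_hash_many_c(increment_counter: bool) {
    println!("increment_counter = {}", increment_counter);
}

fn main() {
    blake3_hash_many_c(IncrementCounter::Yes.yes()); // true: hashing chunks
    blake3_hash_many_c(IncrementCounter::No.yes()); // false: hashing parents
}
```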
diff --git a/src/c_neon.rs b/src/c_neon.rs
index 029bf7e..34ef074 100644
--- a/src/c_neon.rs
+++ b/src/c_neon.rs
@@ -1,4 +1,4 @@
-use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
pub const DEGREE: usize = 4;
@@ -6,8 +6,8 @@ pub const DEGREE: usize = 4;
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -22,8 +22,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs.len(),
A::CAPACITY / BLOCK_LEN,
key.as_ptr(),
- offset,
- offset_deltas.as_ptr(),
+ counter,
+ increment_counter.yes(),
flags,
flags_start,
flags_end,
@@ -40,7 +40,7 @@ pub extern "C" fn blake3_compress_in_place_portable(
cv: *mut u32,
block: *const u8,
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) {
unsafe {
@@ -48,7 +48,7 @@ pub extern "C" fn blake3_compress_in_place_portable(
&mut *(cv as *mut [u32; 8]),
&*(block as *const [u8; 64]),
block_len,
- offset,
+ counter,
flags,
)
}
@@ -61,8 +61,8 @@ pub mod ffi {
num_inputs: usize,
blocks: usize,
key: *const u32,
- offset: u64,
- offset_deltas: *const u64,
+ counter: u64,
+ increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
diff --git a/src/lib.rs b/src/lib.rs
index 6489c04..55ac42b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,33 +63,6 @@ const MSG_SCHEDULE: [[usize; 16]; 7] = [
[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
];
-// 17 is 1 + the largest supported SIMD degree (including AVX-512, currently in C).
-// Each hash_many() implementation can thus do `offset += offset_deltas[DEGREE]`
-// at the end of each batch.
-type OffsetDeltas = [u64; 17];
-
-const CHUNK_OFFSET_DELTAS: &OffsetDeltas = &[
- CHUNK_LEN as u64 * 0,
- CHUNK_LEN as u64 * 1,
- CHUNK_LEN as u64 * 2,
- CHUNK_LEN as u64 * 3,
- CHUNK_LEN as u64 * 4,
- CHUNK_LEN as u64 * 5,
- CHUNK_LEN as u64 * 6,
- CHUNK_LEN as u64 * 7,
- CHUNK_LEN as u64 * 8,
- CHUNK_LEN as u64 * 9,
- CHUNK_LEN as u64 * 10,
- CHUNK_LEN as u64 * 11,
- CHUNK_LEN as u64 * 12,
- CHUNK_LEN as u64 * 13,
- CHUNK_LEN as u64 * 14,
- CHUNK_LEN as u64 * 15,
- CHUNK_LEN as u64 * 16,
-];
-
-const PARENT_OFFSET_DELTAS: &OffsetDeltas = &[0; 17];
-
// These are the internal flags that we use to domain separate root/non-root,
// chunk/parent, and chunk beginning/middle/end. These get set at the high end
// of the block flags word in the compression function, so their values start
@@ -101,12 +74,12 @@ const ROOT: u8 = 1 << 3;
const KEYED_HASH: u8 = 1 << 4;
const DERIVE_KEY: u8 = 1 << 5;
-fn offset_low(offset: u64) -> u32 {
- offset as u32
+fn counter_low(counter: u64) -> u32 {
+ counter as u32
}
-fn offset_high(offset: u64) -> u32 {
- (offset >> 32) as u32
+fn counter_high(counter: u64) -> u32 {
+ (counter >> 32) as u32
}
/// A BLAKE3 output of the default size, 32 bytes, which implements
@@ -181,7 +154,7 @@ struct Output {
input_chaining_value: CVWords,
block: [u8; 64],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
platform: Platform,
}
@@ -193,14 +166,14 @@ impl Output {
&mut cv,
&self.block,
self.block_len,
- self.offset,
+ self.counter,
self.flags,
);
platform::le_bytes_from_words_32(&cv)
}
fn root_hash(&self) -> Hash {
- debug_assert_eq!(self.offset, 0);
+ debug_assert_eq!(self.counter, 0);
let mut cv = self.input_chaining_value;
self.platform
.compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
@@ -212,7 +185,7 @@ impl Output {
&self.input_chaining_value,
&self.block,
self.block_len,
- self.offset,
+ self.counter,
self.flags | ROOT,
)
}
@@ -221,7 +194,7 @@ impl Output {
#[derive(Clone)]
struct ChunkState {
cv: CVWords,
- offset: u64,
+ chunk_counter: u64,
buf: [u8; BLOCK_LEN],
buf_len: u8,
blocks_compressed: u8,
@@ -230,10 +203,10 @@ struct ChunkState {
}
impl ChunkState {
- fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self {
+ fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
Self {
cv: *key,
- offset,
+ chunk_counter,
buf: [0; BLOCK_LEN],
buf_len: 0,
blocks_compressed: 0,
@@ -242,15 +215,6 @@ impl ChunkState {
}
}
- fn reset(&mut self, key: &CVWords, new_offset: u64) {
- debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0);
- self.cv = *key;
- self.offset = new_offset;
- self.buf = [0; BLOCK_LEN];
- self.buf_len = 0;
- self.blocks_compressed = 0;
- }
-
fn len(&self) -> usize {
BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
}
@@ -283,7 +247,7 @@ impl ChunkState {
&mut self.cv,
&self.buf,
BLOCK_LEN as u8,
- self.offset,
+ self.chunk_counter,
block_flags,
);
self.buf_len = 0;
@@ -299,7 +263,7 @@ impl ChunkState {
&mut self.cv,
array_ref!(input, 0, BLOCK_LEN),
BLOCK_LEN as u8,
- self.offset,
+ self.chunk_counter,
block_flags,
);
self.blocks_compressed += 1;
@@ -318,7 +282,7 @@ impl ChunkState {
input_chaining_value: self.cv,
block: self.buf,
block_len: self.buf_len,
- offset: self.offset,
+ counter: self.chunk_counter,
flags: block_flags,
platform: self.platform,
}
@@ -330,9 +294,9 @@ impl fmt::Debug for ChunkState {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
- "ChunkState {{ len: {}, offset: {}, flags: {:?}, platform: {:?} }}",
+ "ChunkState {{ len: {}, chunk_counter: {}, flags: {:?}, platform: {:?} }}",
self.len(),
- self.offset,
+ self.chunk_counter,
self.flags,
self.platform
)
@@ -354,6 +318,23 @@ impl fmt::Debug for ChunkState {
// use full-width SIMD vectors for parent hashing. Without parallel parent
// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
+// pub for benchmarks
+#[doc(hidden)]
+#[derive(Clone, Copy)]
+pub enum IncrementCounter {
+ Yes,
+ No,
+}
+
+impl IncrementCounter {
+ fn yes(&self) -> bool {
+ match self {
+ IncrementCounter::Yes => true,
+ IncrementCounter::No => false,
+ }
+ }
+}
+
// The largest power of two less than or equal to `n`, used for left_len()
// immediately below, and also directly in Hasher::update().
fn largest_power_of_two_leq(n: usize) -> usize {
@@ -395,14 +376,13 @@ where
fn compress_chunks_parallel(
input: &[u8],
key: &CVWords,
- offset: u64,
+ chunk_counter: u64,
flags: u8,
platform: Platform,
out: &mut [u8],
) -> usize {
debug_assert!(!input.is_empty(), "empty chunks below the root");
debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
- debug_assert_eq!(offset % CHUNK_LEN as u64, 0, "invalid offset");
let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new();
@@ -412,8 +392,8 @@ fn compress_chunks_parallel(
platform.hash_many(
&chunks_array,
key,
- offset,
- CHUNK_OFFSET_DELTAS,
+ chunk_counter,
+ IncrementCounter::Yes,
flags,
CHUNK_START,
CHUNK_END,
@@ -424,8 +404,8 @@ fn compress_chunks_parallel(
// chunk (meaning the empty message) is a different codepath.
let chunks_so_far = chunks_array.len();
if !chunks_exact.remainder().is_empty() {
- let chunk_offset = offset + (chunks_so_far * CHUNK_LEN) as u64;
- let mut chunk_state = ChunkState::new(key, chunk_offset, flags, platform);
+ let counter = chunk_counter + chunks_so_far as u64;
+ let mut chunk_state = ChunkState::new(key, counter, flags, platform);
chunk_state.update(chunks_exact.remainder());
*array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
chunk_state.output().chaining_value();
@@ -462,8 +442,8 @@ fn compress_parents_parallel(
platform.hash_many(
&parents_array,
key,
- 0, // Parents always use offset 0.
- PARENT_OFFSET_DELTAS,
+ 0, // Parents always use counter 0.
+ IncrementCounter::No,
flags | PARENT,
0, // Parents have no start flags.
0, // Parents have no end flags.
@@ -496,7 +476,7 @@ fn compress_parents_parallel(
fn compress_subtree_wide(
input: &[u8],
key: &CVWords,
- offset: u64,
+ chunk_counter: u64,
flags: u8,
platform: Platform,
out: &mut [u8],
@@ -505,7 +485,7 @@ fn compress_subtree_wide(
// when it is 1. This allows Rayon the option of multi-threading even the
// 2-chunk case, which can help performance on smaller platforms.
if input.len() <= platform.simd_degree() * CHUNK_LEN {
- return compress_chunks_parallel(input, key, offset, flags, platform, out);
+ return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
}
// With more than simd_degree chunks, we need to recurse. Start by dividing
@@ -514,7 +494,7 @@ fn compress_subtree_wide(
// of 3 or something, we'll need a more complicated strategy.)
debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
let (left, right) = input.split_at(left_len(input.len()));
- let right_offset = offset + left.len() as u64;
+ let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
// Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
// account for the special case of returning 2 outputs when the SIMD degree
@@ -531,8 +511,8 @@ fn compress_subtree_wide(
// Recurse! This uses multiple threads if the "rayon" feature is enabled.
let (left_n, right_n) = join(
- || compress_subtree_wide(left, key, offset, flags, platform, left_out),
- || compress_subtree_wide(right, key, right_offset, flags, platform, right_out),
+ || compress_subtree_wide(left, key, chunk_counter, flags, platform, left_out),
+ || compress_subtree_wide(right, key, right_chunk_counter, flags, platform, right_out),
);
// The special case again. If simd_degree=1, then we'll have left_n=1 and
@@ -568,13 +548,14 @@ fn compress_subtree_wide(
fn compress_subtree_to_parent_node(
input: &[u8],
key: &CVWords,
- offset: u64,
+ chunk_counter: u64,
flags: u8,
platform: Platform,
) -> [u8; BLOCK_LEN] {
debug_assert!(input.len() > CHUNK_LEN);
let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
- let mut num_cvs = compress_subtree_wide(input, &key, offset, flags, platform, &mut cv_array);
+ let mut num_cvs =
+ compress_subtree_wide(input, &key, chunk_counter, flags, platform, &mut cv_array);
debug_assert!(num_cvs >= 2);
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
@@ -607,7 +588,7 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
input_chaining_value: *key,
block: compress_subtree_to_parent_node(input, key, 0, flags, platform),
block_len: BLOCK_LEN as u8,
- offset: 0,
+ counter: 0,
flags: flags | PARENT,
platform,
}
@@ -646,7 +627,7 @@ fn parent_node_output(
input_chaining_value: *key,
block,
block_len: BLOCK_LEN as u8,
- offset: 0,
+ counter: 0,
flags: flags | PARENT,
platform,
}
@@ -694,11 +675,6 @@ impl Hasher {
Self::new_internal(&key_words, DERIVE_KEY)
}
- /// The total number of input bytes so far.
- pub fn count(&self) -> u64 {
- self.chunk_state.offset + self.chunk_state.len() as u64
- }
-
// See comment in push_cv.
fn merge_cv_stack(&mut self, total_len: u64) {
let post_merge_stack_len = total_len.count_ones() as usize;
@@ -716,7 +692,7 @@ impl Hasher {
}
}
- fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) {
+ fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
// In reference_impl.rs, we merge the new CV with existing CVs from the
// stack before pushing it. We can do that because we know more input
// is coming, so we know none of the merges are root.
@@ -739,10 +715,10 @@ impl Hasher {
// merging with the new CV itself.
//
// We still use the "count the 1 bits" algorithm, adjusted slightly for
- // this setting, using the offset (the start of the new CV's bytes)
- // rather than the final total (the end of the new CV's bytes). That
+ // this setting, using the new chunk's counter number (the previous
+ // total number of chunks) rather than the new total number of chunks. That
// algorithm is explained in detail in the spec.
- self.merge_cv_stack(offset);
+ self.merge_cv_stack(chunk_counter);
self.cv_stack.push(*new_cv);
}
@@ -762,9 +738,13 @@ impl Hasher {
// Then we'll proceed to hashing whole chunks below.
debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
let chunk_cv = self.chunk_state.output().chaining_value();
- self.push_cv(&chunk_cv, self.chunk_state.offset);
- let new_offset = self.chunk_state.offset + CHUNK_LEN as u64;
- self.chunk_state.reset(&self.key, new_offset);
+ self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+ self.chunk_state = ChunkState::new(
+ &self.key,
+ self.chunk_state.chunk_counter + 1,
+ self.chunk_state.flags,
+ self.chunk_state.platform,
+ );
} else {
return self;
}
@@ -786,32 +766,33 @@ impl Hasher {
while input.len() > CHUNK_LEN {
debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
- debug_assert_eq!(self.chunk_state.offset % CHUNK_LEN as u64, 0);
let mut subtree_len = largest_power_of_two_leq(input.len());
+ let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
// Shrink the subtree_len until it evenly divides the count so far.
// We know it's a power of 2, so we can use a bitmask rather than
// the more expensive modulus operation. Note that if the caller
// consistently passes power-of-2 inputs of the same size (as is
// hopefully typical), we'll always skip over this loop.
- while (subtree_len - 1) as u64 & self.chunk_state.offset != 0 {
+ while (subtree_len - 1) as u64 & count_so_far != 0 {
subtree_len /= 2;
}
// The shrunken subtree_len might now be 1 chunk long. If so, hash
// that one chunk by itself. Otherwise, compress the subtree into a
// pair of CVs.
+ let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
if subtree_len <= CHUNK_LEN {
debug_assert_eq!(subtree_len, CHUNK_LEN);
self.push_cv(
&ChunkState::new(
&self.key,
- self.chunk_state.offset,
+ self.chunk_state.chunk_counter,
self.chunk_state.flags,
self.chunk_state.platform,
)
.update(&input[..subtree_len])
.output()
.chaining_value(),
- self.chunk_state.offset,
+ self.chunk_state.chunk_counter,
);
} else {
// This is the high-performance happy path, though getting here
@@ -819,7 +800,7 @@ impl Hasher {
let cv_pair = compress_subtree_to_parent_node(
&input[..subtree_len],
&self.key,
- self.chunk_state.offset,
+ self.chunk_state.chunk_counter,
self.chunk_state.flags,
self.chunk_state.platform,
);
@@ -828,10 +809,13 @@ impl Hasher {
// Push the two CVs we received into the CV stack in order. Because
// the stack merges lazily, this guarantees we aren't merging the
// root.
- self.push_cv(left_cv, self.chunk_state.offset);
- self.push_cv(right_cv, self.chunk_state.offset + (subtree_len as u64 / 2));
+ self.push_cv(left_cv, self.chunk_state.chunk_counter);
+ self.push_cv(
+ right_cv,
+ self.chunk_state.chunk_counter + (subtree_chunks / 2),
+ );
}
- self.chunk_state.offset += subtree_len as u64;
+ self.chunk_state.chunk_counter += subtree_chunks;
input = &input[subtree_len..];
}
@@ -842,7 +826,7 @@ impl Hasher {
// Having added some input to the chunk_state, we know what's in
// the CV stack won't become the root node, and we can do an extra
// merge. This simplifies finalize().
- self.merge_cv_stack(self.chunk_state.offset);
+ self.merge_cv_stack(self.chunk_state.chunk_counter);
}
self
@@ -853,7 +837,7 @@ impl Hasher {
// also. Convert it directly into an Output. Otherwise, we need to
// merge subtrees below.
if self.cv_stack.is_empty() {
- debug_assert_eq!(self.chunk_state.offset, 0);
+ debug_assert_eq!(self.chunk_state.chunk_counter, 0);
return self.chunk_state.output();
}
@@ -874,7 +858,7 @@ impl Hasher {
if self.chunk_state.len() > 0 {
debug_assert_eq!(
self.cv_stack.len(),
- self.chunk_state.offset.count_ones() as usize,
+ self.chunk_state.chunk_counter.count_ones() as usize,
"cv stack does not need a merge"
);
output = self.chunk_state.output();
@@ -930,10 +914,8 @@ impl fmt::Debug for Hasher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
- "Hasher {{ count: {}, flags: {:?}, platform: {:?} }}",
- self.count(),
- self.chunk_state.flags,
- self.chunk_state.platform
+ "Hasher {{ flags: {:?}, platform: {:?} }}",
+ self.chunk_state.flags, self.chunk_state.platform
)
}
}
@@ -982,19 +964,19 @@ impl OutputReader {
buf = &mut buf[take..];
self.position_within_block += take as u8;
if self.position_within_block == BLOCK_LEN as u8 {
- self.inner.offset += BLOCK_LEN as u64;
+ self.inner.counter += 1;
self.position_within_block = 0;
}
}
}
pub fn position(&self) -> u64 {
- self.inner.offset + self.position_within_block as u64
+ self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
}
pub fn set_position(&mut self, position: u64) {
self.position_within_block = (position % BLOCK_LEN as u64) as u8;
- self.inner.offset = position - self.position_within_block as u64;
+ self.inner.counter = position / BLOCK_LEN as u64;
}
}
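OutputReader's bookkeeping changes the same way: inner.counter now counts 64-byte output blocks instead of byte offsets, so seeking splits a byte position into a block counter and an intra-block offset. A standalone sketch of the new position math:

```rust
// Sketch of OutputReader's position math (fields other than the two shown
// are omitted); BLOCK_LEN is 64 as in the crate.
const BLOCK_LEN: u64 = 64;

struct Position {
    counter: u64,              // output block index fed to the compressor
    position_within_block: u8, // byte offset inside the current block
}

impl Position {
    fn position(&self) -> u64 {
        self.counter * BLOCK_LEN + self.position_within_block as u64
    }

    fn set_position(&mut self, position: u64) {
        self.position_within_block = (position % BLOCK_LEN) as u8;
        self.counter = position / BLOCK_LEN;
    }
}

fn main() {
    let mut p = Position { counter: 0, position_within_block: 0 };
    p.set_position(200); // byte 200 = block 3, offset 8
    assert_eq!((p.counter, p.position_within_block), (3, 8));
    assert_eq!(p.position(), 200);
}
```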
diff --git a/src/platform.rs b/src/platform.rs
index 66d70f6..abf9d30 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -1,4 +1,4 @@
-use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN};
+use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};
#[cfg(feature = "c_avx512")]
@@ -108,25 +108,25 @@ impl Platform {
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) {
match self {
- Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags),
+ Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
- sse41::compress_in_place(cv, block, block_len, offset, flags)
+ sse41::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
- c_avx512::compress_in_place(cv, block, block_len, offset, flags)
+ c_avx512::compress_in_place(cv, block, block_len, counter, flags)
},
// No NEON compress_in_place() implementation yet.
#[cfg(feature = "c_neon")]
- Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags),
+ Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
}
}
@@ -135,25 +135,25 @@ impl Platform {
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u8; 64] {
match self {
- Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags),
+ Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
- sse41::compress_xof(cv, block, block_len, offset, flags)
+ sse41::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(feature = "c_avx512")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
- c_avx512::compress_xof(cv, block, block_len, offset, flags)
+ c_avx512::compress_xof(cv, block, block_len, counter, flags)
},
// No NEON compress_xof() implementation yet.
#[cfg(feature = "c_neon")]
- Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags),
+ Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
}
}
@@ -171,8 +171,8 @@ impl Platform {
&self,
inputs: &[&A],
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -182,8 +182,8 @@ impl Platform {
Platform::Portable => portable::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
@@ -195,8 +195,8 @@ impl Platform {
sse41::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
@@ -209,8 +209,8 @@ impl Platform {
avx2::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
@@ -224,8 +224,8 @@ impl Platform {
c_avx512::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
@@ -238,8 +238,8 @@ impl Platform {
c_neon::hash_many(
inputs,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
diff --git a/src/portable.rs b/src/portable.rs
index fa0e17d..0a569ce 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -1,5 +1,6 @@
use crate::{
- offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
+ OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref};
@@ -38,7 +39,7 @@ fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u32; 16] {
let block_words = crate::platform::words_from_le_bytes_64(block);
@@ -56,8 +57,8 @@ fn compress_pre(
IV[1],
IV[2],
IV[3],
- offset_low(offset),
- offset_high(offset),
+ counter_low(counter),
+ counter_high(counter),
block_len as u32,
flags as u32,
];
@@ -77,10 +78,10 @@ pub fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) {
- let state = compress_pre(cv, block, block_len, offset, flags);
+ let state = compress_pre(cv, block, block_len, counter, flags);
cv[0] = state[0] ^ state[8];
cv[1] = state[1] ^ state[9];
@@ -96,10 +97,10 @@ pub fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u8; 64] {
- let mut state = compress_pre(cv, block, block_len, offset, flags);
+ let mut state = compress_pre(cv, block, block_len, counter, flags);
state[0] ^= state[8];
state[1] ^= state[9];
state[2] ^= state[10];
@@ -122,7 +123,7 @@ pub fn compress_xof(
pub fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
key: &CVWords,
- offset: u64,
+ counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -140,7 +141,7 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
- offset,
+ counter,
block_flags,
);
block_flags = flags;
@@ -152,8 +153,8 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
pub fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
key: &CVWords,
- mut offset: u64,
- offset_deltas: &OffsetDeltas,
+ mut counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -164,13 +165,15 @@ pub fn hash_many<A: arrayvec::Array<Item = u8>>(
hash1(
input,
key,
- offset,
+ counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
- offset += offset_deltas[1];
+ if increment_counter.yes() {
+ counter += 1;
+ }
}
}
diff --git a/src/sse41.rs b/src/sse41.rs
index e45b22e..77cb27e 100644
--- a/src/sse41.rs
+++ b/src/sse41.rs
@@ -4,7 +4,8 @@ use core::arch::x86::*;
use core::arch::x86_64::*;
use crate::{
- offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
+ OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
@@ -129,15 +130,15 @@ unsafe fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [__m128i; 4] {
let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8);
let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8);
let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row4 = &mut set4(
- offset_low(offset),
- offset_high(offset),
+ counter_low(counter),
+ counter_high(counter),
block_len as u32,
flags as u32,
);
@@ -313,10 +314,10 @@ pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) {
- let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags);
+ let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, counter, flags);
storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8);
storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8);
}
@@ -326,11 +327,11 @@ pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u8; 64] {
let [mut row1, mut row2, mut row3, mut row4] =
- compress_pre(cv, block, block_len, offset, flags);
+ compress_pre(cv, block, block_len, counter, flags);
row1 = xor(row1, row3);
row2 = xor(row2, row4);
row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8));
@@ -506,19 +507,20 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
}
#[inline(always)]
-unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, __m128i) {
+unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
+ let mask = if increment_counter.yes() { !0 } else { 0 };
(
set4(
- offset_low(offset + offset_deltas[0]),
- offset_low(offset + offset_deltas[1]),
- offset_low(offset + offset_deltas[2]),
- offset_low(offset + offset_deltas[3]),
+ counter_low(counter + (mask & 0)),
+ counter_low(counter + (mask & 1)),
+ counter_low(counter + (mask & 2)),
+ counter_low(counter + (mask & 3)),
),
set4(
- offset_high(offset + offset_deltas[0]),
- offset_high(offset + offset_deltas[1]),
- offset_high(offset + offset_deltas[2]),
- offset_high(offset + offset_deltas[3]),
+ counter_high(counter + (mask & 0)),
+ counter_high(counter + (mask & 1)),
+ counter_high(counter + (mask & 2)),
+ counter_high(counter + (mask & 3)),
),
)
}
@@ -528,8 +530,8 @@ pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -545,7 +547,7 @@ pub unsafe fn hash4(
set1(key[6]),
set1(key[7]),
];
- let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
+ let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
@@ -573,8 +575,8 @@ pub unsafe fn hash4(
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
- offset_low_vec,
- offset_high_vec,
+ counter_low_vec,
+ counter_high_vec,
block_len_vec,
block_flags_vec,
];
@@ -616,7 +618,7 @@ pub unsafe fn hash4(
unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
key: &CVWords,
- offset: u64,
+ counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -634,7 +636,7 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
- offset,
+ counter,
block_flags,
);
block_flags = flags;
@@ -647,8 +649,8 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
mut inputs: &[&A],
key: &CVWords,
- mut offset: u64,
- offset_deltas: &OffsetDeltas,
+ mut counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -664,28 +666,32 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
input_ptrs,
blocks,
key,
- offset,
- offset_deltas,
+ counter,
+ increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
+ if increment_counter.yes() {
+ counter += DEGREE as u64;
+ }
inputs = &inputs[DEGREE..];
- offset += DEGREE as u64 * offset_deltas[1];
out = &mut out[DEGREE * OUT_LEN..];
}
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
- offset,
+ counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
- offset += offset_deltas[1];
+ if increment_counter.yes() {
+ counter += 1;
+ }
}
}
diff --git a/src/test.rs b/src/test.rs
index 404079c..e7fe96e 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,4 +1,4 @@
-use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
+use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use arrayref::array_ref;
use arrayvec::ArrayVec;
use core::usize;
@@ -48,13 +48,13 @@ pub fn paint_test_input(buf: &mut [u8]) {
}
type CompressInPlaceFn =
- unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8);
+ unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
type CompressXofFn = unsafe fn(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
- offset: u64,
+ counter: u64,
flags: u8,
) -> [u8; 64];
@@ -64,18 +64,18 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn
let block_len: u8 = 61;
let mut block = [0; BLOCK_LEN];
paint_test_input(&mut block[..block_len as usize]);
- // Use an offset with set bits in both 32-bit words.
- let offset = ((5 * CHUNK_LEN as u64) << 32) + 6 * CHUNK_LEN as u64;
+ // Use a counter with set bits in both 32-bit words.
+ let counter = (5u64 << 32) + 6;
let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
let portable_out =
- crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags);
+ crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
let mut test_state = initial_state;
- unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) };
+ unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
let test_xof =
- unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) };
+ unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
assert_eq!(&portable_out[..32], &test_state_bytes[..]);
assert_eq!(&portable_out[..], &test_xof[..]);
@@ -84,8 +84,8 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn
type HashManyFn<A> = unsafe fn(
inputs: &[&A],
key: &CVWords,
- offset: u64,
- offset_deltas: &OffsetDeltas,
+ counter: u64,
+ increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
@@ -101,8 +101,8 @@ pub fn test_hash_many_fn(
const NUM_INPUTS: usize = 31;
let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
crate::test::paint_test_input(&mut input_buf);
- // An offset just prior to u32::MAX.
- let offset = (1 << 32) - CHUNK_LEN as u64;
+ // A counter just prior to u32::MAX.
+ let counter = (1u64 << 32) - 1;
// First hash chunks.
let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new();
@@ -113,8 +113,8 @@ pub fn test_hash_many_fn(
crate::portable::hash_many(
&chunks,
&TEST_KEY_WORDS,
- offset,
- crate::CHUNK_OFFSET_DELTAS,
+ counter,
+ IncrementCounter::Yes,
crate::DERIVE_KEY,
crate::CHUNK_START,
crate::CHUNK_END,
@@ -126,8 +126,8 @@ pub fn test_hash_many_fn(
hash_many_chunks_fn(
&chunks[..],
&TEST_KEY_WORDS,
- offset,
- crate::CHUNK_OFFSET_DELTAS,
+ counter,
+ IncrementCounter::Yes,
crate::DERIVE_KEY,
crate::CHUNK_START,
crate::CHUNK_END,
@@ -153,7 +153,7 @@ pub fn test_hash_many_fn(
&parents,
&TEST_KEY_WORDS,
0,
- crate::PARENT_OFFSET_DELTAS,
+ IncrementCounter::No,
crate::DERIVE_KEY | crate::PARENT,
0,
0,
@@ -166,7 +166,7 @@ pub fn test_hash_many_fn(
&parents[..],
&TEST_KEY_WORDS,
0,
- crate::PARENT_OFFSET_DELTAS,
+ IncrementCounter::No,
crate::DERIVE_KEY | crate::PARENT,
0,
0,
@@ -202,10 +202,10 @@ fn test_reference_impl_size() {
}
#[test]
-fn test_offset_words() {
- let offset: u64 = (1 << 32) + 2;
- assert_eq!(crate::offset_low(offset), 2);
- assert_eq!(crate::offset_high(offset), 1);
+fn test_counter_words() {
+ let counter: u64 = (1 << 32) + 2;
+ assert_eq!(crate::counter_low(counter), 2);
+ assert_eq!(crate::counter_high(counter), 1);
}
#[test]