diff options
| author | Jack O'Connor <[email protected]> | 2019-12-12 18:21:17 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2019-12-12 21:41:30 -0500 |
| commit | b5f1e925f79589ade494289d730c1fb5c25a09b1 (patch) | |
| tree | a8608132b339b9f2c55bec46bf18d297cd0216b6 /src | |
| parent | a5cc3b28679d724020b8810ea9aecea9f54a058f (diff) | |
rename "offset" to "counter" and always increment it by 1
This is simpler than sometimes incrementing by CHUNK_LEN and other times
incrementing by BLOCK_LEN.
Diffstat (limited to 'src')
| -rw-r--r-- | src/avx2.rs | 65 | ||||
| -rw-r--r-- | src/c/blake3.h | 4 | ||||
| -rw-r--r-- | src/c/blake3_avx512.c | 143 | ||||
| -rw-r--r-- | src/c/blake3_impl.h | 57 | ||||
| -rw-r--r-- | src/c/blake3_neon.c | 56 | ||||
| -rw-r--r-- | src/c_avx512.rs | 27 | ||||
| -rw-r--r-- | src/c_neon.rs | 18 | ||||
| -rw-r--r-- | src/lib.rs | 180 | ||||
| -rw-r--r-- | src/platform.rs | 46 | ||||
| -rw-r--r-- | src/portable.rs | 31 | ||||
| -rw-r--r-- | src/sse41.rs | 68 | ||||
| -rw-r--r-- | src/test.rs | 44 |
12 files changed, 376 insertions, 363 deletions
diff --git a/src/avx2.rs b/src/avx2.rs index 471a2dc..3424a83 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -3,7 +3,9 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN}; +use crate::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; use arrayref::{array_mut_ref, mut_array_refs}; pub const DEGREE: usize = 8; @@ -270,27 +272,28 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) } #[inline(always)] -unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, __m256i) { +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; ( set8( - offset_low(offset + offset_deltas[0]), - offset_low(offset + offset_deltas[1]), - offset_low(offset + offset_deltas[2]), - offset_low(offset + offset_deltas[3]), - offset_low(offset + offset_deltas[4]), - offset_low(offset + offset_deltas[5]), - offset_low(offset + offset_deltas[6]), - offset_low(offset + offset_deltas[7]), + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), ), set8( - offset_high(offset + offset_deltas[0]), - offset_high(offset + offset_deltas[1]), - offset_high(offset + offset_deltas[2]), - offset_high(offset + offset_deltas[3]), - offset_high(offset + offset_deltas[4]), - offset_high(offset + offset_deltas[5]), - offset_high(offset + offset_deltas[6]), - offset_high(offset + offset_deltas[7]), + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), ), ) } @@ -300,8 +303,8 @@ pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -317,7 +320,7 @@ pub unsafe fn hash8( set1(key[6]), set1(key[7]), ]; - let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas); + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { @@ -345,8 +348,8 @@ pub unsafe fn hash8( set1(IV[1]), set1(IV[2]), set1(IV[3]), - offset_low_vec, - offset_high_vec, + counter_low_vec, + counter_high_vec, block_len_vec, block_flags_vec, ]; @@ -384,8 +387,8 @@ pub unsafe fn hash8( pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( mut inputs: &[&A], key: &CVWords, - mut offset: u64, - offset_deltas: &OffsetDeltas, + mut counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -401,22 +404,24 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( input_ptrs, blocks, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); + if increment_counter.yes() { + counter += DEGREE as u64; + } inputs = &inputs[DEGREE..]; - offset += DEGREE as u64 * offset_deltas[1]; out = &mut out[DEGREE * OUT_LEN..]; } crate::sse41::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, diff --git a/src/c/blake3.h b/src/c/blake3.h index 4d4a409..4c5624c 100644 --- a/src/c/blake3.h +++ b/src/c/blake3.h @@ -11,10 +11,10 @@ typedef struct { uint32_t cv[8]; - uint64_t offset; - uint16_t count; + uint64_t chunk_counter; uint8_t buf[BLAKE3_BLOCK_LEN]; uint8_t buf_len; + uint8_t blocks_compressed; uint8_t flags; } blake3_chunk_state; diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c index f30e302..2c8657c 100644 --- a/src/c/blake3_avx512.c +++ b/src/c/blake3_avx512.c @@ -111,12 +111,12 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags) { + uint8_t block_len, uint64_t counter, uint8_t flags) { rows[0] = loadu_128((uint8_t *)&cv[0]); rows[1] = loadu_128((uint8_t *)&cv[4]); rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); - rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len, - (uint32_t)flags); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); @@ -281,10 +281,10 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { __m128i rows[4]; - compress_pre(rows, cv, block, block_len, offset, flags); + compress_pre(rows, cv, block, block_len, counter, flags); storeu_128(xor_128(rows[0], rows[2]), &out[0]); storeu_128(xor_128(rows[1], rows[3]), &out[16]); storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); @@ -293,10 +293,10 @@ void blake3_compress_xof_avx512(const uint32_t cv[8], void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags) { __m128i rows[4]; - compress_pre(rows, cv, block, block_len, offset, flags); + compress_pre(rows, cv, block, block_len, counter, flags); storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); } @@ -468,24 +468,29 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, transpose_vecs_128(&out[12]); } -INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], - __m128i *out_lo, __m128i *out_hi) { - __m256i a = _mm256_add_epi64(_mm256_set1_epi64x((int64_t)offset), - _mm256_loadu_si256((const __m256i *)deltas)); - *out_lo = _mm256_cvtepi64_epi32(a); - *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(a, 32)); +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); } void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t offset, - offset_deltas_t offset_deltas, uint8_t flags, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m128i h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; - __m128i offset_low_vec, offset_high_vec; - load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { @@ -498,10 +503,10 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m128i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec, + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn4(v, msg_vecs, 0); round_fn4(v, msg_vecs, 1); @@ -714,24 +719,29 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, transpose_vecs_256(&out[8]); } -INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8], - __m256i *out_lo, __m256i *out_hi) { - __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset), - _mm512_loadu_si512((const __m512i *)deltas)); - *out_lo = _mm512_cvtepi64_epi32(a); - *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(a, 32)); +INLINE void load_counters8(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); } void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t offset, - offset_deltas_t offset_deltas, uint8_t flags, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m256i h_vecs[8] = { set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), }; - __m256i offset_low_vec, offset_high_vec; - load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec); + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { @@ -744,10 +754,10 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m256i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), - offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec, + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn8(v, msg_vecs, 0); round_fn8(v, msg_vecs, 1); @@ -1018,12 +1028,16 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, transpose_vecs_512(out); } -INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16], - __m512i *out_lo, __m512i *out_hi) { - __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset), - _mm512_loadu_si512((const __m512i *)&deltas[0])); - __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset), - _mm512_loadu_si512((const __m512i *)&deltas[8])); +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas_a = _mm512_and_si512(mask_vec, deltas_a); + __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15); + deltas_b = _mm512_and_si512(mask_vec, deltas_b); + __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a); + __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b); __m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, @@ -1033,16 +1047,17 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16], } void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t offset, - offset_deltas_t offset_deltas, uint8_t flags, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { __m512i h_vecs[8] = { set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), }; - __m512i offset_low_vec, offset_high_vec; - load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec); + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { @@ -1055,10 +1070,10 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); __m512i v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), - offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec, + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn16(v, msg_vecs, 0); round_fn16(v, msg_vecs, 1); @@ -1114,7 +1129,7 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t offset, + const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; @@ -1124,7 +1139,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, if (blocks == 1) { block_flags |= flags_end; } - blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; @@ -1135,39 +1150,47 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t offset_deltas, + uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= 16) { - blake3_hash16_avx512(inputs, blocks, key, offset, offset_deltas, flags, + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 16; + } inputs += 16; num_inputs -= 16; - offset += offset_deltas[16]; out = &out[16 * BLAKE3_OUT_LEN]; } while (num_inputs >= 8) { - blake3_hash8_avx512(inputs, blocks, key, offset, offset_deltas, flags, + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } inputs += 8; num_inputs -= 8; - offset += offset_deltas[8]; out = &out[8 * BLAKE3_OUT_LEN]; } while (num_inputs >= 4) { - blake3_hash4_avx512(inputs, blocks, key, offset, offset_deltas, flags, + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } inputs += 4; num_inputs -= 4; - offset += offset_deltas[4]; out = &out[4 * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { - hash_one_avx512(inputs[0], blocks, key, offset, flags, flags_start, + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } inputs += 1; num_inputs -= 1; - offset += offset_deltas[1]; out = &out[BLAKE3_OUT_LEN]; } } diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h index af55d93..9a44391 100644 --- a/src/c/blake3_impl.h +++ b/src/c/blake3_impl.h @@ -42,19 +42,6 @@ static const uint8_t MSG_SCHEDULE[7][16] = { {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, }; -// 17 is 1 + the largest supported SIMD degree. Each hash_many() implementation -// can thus do `offset += offset_deltas[DEGREE]` at the end of each batch. -typedef const uint64_t offset_deltas_t[17]; - -static offset_deltas_t CHUNK_OFFSET_DELTAS = { - BLAKE3_CHUNK_LEN * 0, BLAKE3_CHUNK_LEN * 1, BLAKE3_CHUNK_LEN * 2, - BLAKE3_CHUNK_LEN * 3, BLAKE3_CHUNK_LEN * 4, BLAKE3_CHUNK_LEN * 5, - BLAKE3_CHUNK_LEN * 6, BLAKE3_CHUNK_LEN * 7, BLAKE3_CHUNK_LEN * 8, - BLAKE3_CHUNK_LEN * 9, BLAKE3_CHUNK_LEN * 10, BLAKE3_CHUNK_LEN * 11, - BLAKE3_CHUNK_LEN * 12, BLAKE3_CHUNK_LEN * 13, BLAKE3_CHUNK_LEN * 14, - BLAKE3_CHUNK_LEN * 15, BLAKE3_CHUNK_LEN * 16, -}; - // Count the number of 1 bits. INLINE uint8_t popcnt(uint64_t x) { #if __POPCNT__ @@ -69,10 +56,10 @@ INLINE uint8_t popcnt(uint64_t x) { #endif } -INLINE uint32_t offset_low(uint64_t offset) { return (uint32_t)offset; } +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } -INLINE uint32_t offset_high(uint64_t offset) { - return (uint32_t)(offset >> 32); +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); } INLINE uint32_t load32(const void *src) { @@ -96,50 +83,50 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], // Declarations for implementation-specific functions. void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags); void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, + uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t od, + uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t od, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out); + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t od, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out); + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t od, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out); + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t od, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t *out); + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c index 0ffa17e..7335c19 100644 --- a/src/c/blake3_neon.c +++ b/src/c/blake3_neon.c @@ -212,26 +212,28 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, transpose_vecs_128(&out[12]); } -INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], - uint32x4_t *out_lo, uint32x4_t *out_hi) { - *out_lo = - set4(offset_low(offset + deltas[0]), offset_low(offset + deltas[1]), - offset_low(offset + deltas[2]), offset_low(offset + deltas[3])); - *out_hi = - set4(offset_high(offset + deltas[0]), offset_high(offset + deltas[1]), - offset_high(offset + deltas[2]), offset_high(offset + deltas[3])); +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? ~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); } void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t offset, - offset_deltas_t offset_deltas, uint8_t flags, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { uint32x4_t h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; - uint32x4_t offset_low_vec, offset_high_vec; - load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); uint8_t block_flags = flags | flags_start; for (size_t block = 0; block < blocks; block++) { @@ -244,10 +246,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); uint32x4_t v[16] = { - h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], - h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], - set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), - offset_low_vec, offset_high_vec, block_len_vec, block_flags_vec, + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, }; round_fn4(v, msg_vecs, 0); round_fn4(v, msg_vecs, 1); @@ -289,8 +291,8 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, - const uint32_t key[8], uint64_t offset, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); @@ -302,7 +304,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, // TODO: Implement compress_neon. However note that according to // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, // compress_neon might not be any faster than compress_portable. - blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset, + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; @@ -313,23 +315,27 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t offset, offset_deltas_t offset_deltas, + uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { while (num_inputs >= 4) { - blake3_hash4_neon(inputs, blocks, key, offset, offset_deltas, flags, + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } inputs += 4; num_inputs -= 4; - offset += offset_deltas[4]; out = &out[4 * BLAKE3_OUT_LEN]; } while (num_inputs > 0) { - hash_one_neon(inputs[0], blocks, key, offset, flags, flags_start, flags_end, - out); + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } inputs += 1; num_inputs -= 1; - offset += offset_deltas[1]; out = &out[BLAKE3_OUT_LEN]; } } diff --git a/src/c_avx512.rs b/src/c_avx512.rs index e66f102..f20de2c 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,4 +1,4 @@ -use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; pub const DEGREE: usize = 16; @@ -7,10 +7,10 @@ pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) { - ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags) + ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) } // Unsafe because this may only be called on platforms supporting AVX-512. @@ -18,7 +18,7 @@ pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; @@ -26,7 +26,7 @@ pub unsafe fn compress_xof( cv.as_ptr(), block.as_ptr(), block_len, - offset, + counter, flags, out.as_mut_ptr(), ); @@ -37,8 +37,8 @@ pub unsafe fn compress_xof( pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -53,8 +53,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs.len(), A::CAPACITY / BLOCK_LEN, key.as_ptr(), - offset, - offset_deltas.as_ptr(), + counter, + increment_counter.yes(), flags, flags_start, flags_end, @@ -68,14 +68,14 @@ pub mod ffi { cv: *mut u32, block: *const u8, block_len: u8, - offset: u64, + counter: u64, flags: u8, ); pub fn blake3_compress_xof_avx512( cv: *const u32, block: *const u8, block_len: u8, - offset: u64, + counter: u64, flags: u8, out: *mut u8, ); @@ -84,13 +84,14 @@ pub mod ffi { num_inputs: usize, blocks: usize, key: *const u32, - offset: u64, - offset_deltas: *const u64, + counter: u64, + increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); + } } diff --git a/src/c_neon.rs b/src/c_neon.rs index 029bf7e..34ef074 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,4 +1,4 @@ -use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; pub const DEGREE: usize = 4; @@ -6,8 +6,8 @@ pub const DEGREE: usize = 4; pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -22,8 +22,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs.len(), A::CAPACITY / BLOCK_LEN, key.as_ptr(), - offset, - offset_deltas.as_ptr(), + counter, + increment_counter.yes(), flags, flags_start, flags_end, @@ -40,7 +40,7 @@ pub extern "C" fn blake3_compress_in_place_portable( cv: *mut u32, block: *const u8, block_len: u8, - offset: u64, + counter: u64, flags: u8, ) { unsafe { @@ -48,7 +48,7 @@ pub extern "C" fn blake3_compress_in_place_portable( &mut *(cv as *mut [u32; 8]), &*(block as *const [u8; 64]), block_len, - offset, + counter, flags, ) } @@ -61,8 +61,8 @@ pub mod ffi { num_inputs: usize, blocks: usize, key: *const u32, - offset: u64, - offset_deltas: *const u64, + counter: u64, + increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, @@ -63,33 +63,6 @@ const MSG_SCHEDULE: [[usize; 16]; 7] = [ [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], ]; -// 17 is 1 + the largest supported SIMD degree (including AVX-512, currently in C). -// Each hash_many() implementation can thus do `offset += offset_deltas[DEGREE]` -// at the end of each batch. -type OffsetDeltas = [u64; 17]; - -const CHUNK_OFFSET_DELTAS: &OffsetDeltas = &[ - CHUNK_LEN as u64 * 0, - CHUNK_LEN as u64 * 1, - CHUNK_LEN as u64 * 2, - CHUNK_LEN as u64 * 3, - CHUNK_LEN as u64 * 4, - CHUNK_LEN as u64 * 5, - CHUNK_LEN as u64 * 6, - CHUNK_LEN as u64 * 7, - CHUNK_LEN as u64 * 8, - CHUNK_LEN as u64 * 9, - CHUNK_LEN as u64 * 10, - CHUNK_LEN as u64 * 11, - CHUNK_LEN as u64 * 12, - CHUNK_LEN as u64 * 13, - CHUNK_LEN as u64 * 14, - CHUNK_LEN as u64 * 15, - CHUNK_LEN as u64 * 16, -]; - -const PARENT_OFFSET_DELTAS: &OffsetDeltas = &[0; 17]; - // These are the internal flags that we use to domain separate root/non-root, // chunk/parent, and chunk beginning/middle/end. These get set at the high end // of the block flags word in the compression function, so their values start @@ -101,12 +74,12 @@ const ROOT: u8 = 1 << 3; const KEYED_HASH: u8 = 1 << 4; const DERIVE_KEY: u8 = 1 << 5; -fn offset_low(offset: u64) -> u32 { - offset as u32 +fn counter_low(counter: u64) -> u32 { + counter as u32 } -fn offset_high(offset: u64) -> u32 { - (offset >> 32) as u32 +fn counter_high(counter: u64) -> u32 { + (counter >> 32) as u32 } /// A BLAKE3 output of the default size, 32 bytes, which implements @@ -181,7 +154,7 @@ struct Output { input_chaining_value: CVWords, block: [u8; 64], block_len: u8, - offset: u64, + counter: u64, flags: u8, platform: Platform, } @@ -193,14 +166,14 @@ impl Output { &mut cv, &self.block, self.block_len, - self.offset, + self.counter, self.flags, ); platform::le_bytes_from_words_32(&cv) } fn root_hash(&self) -> Hash { - debug_assert_eq!(self.offset, 0); + debug_assert_eq!(self.counter, 0); let mut cv = self.input_chaining_value; self.platform .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); @@ -212,7 +185,7 @@ impl Output { &self.input_chaining_value, &self.block, self.block_len, - self.offset, + self.counter, self.flags | ROOT, ) } @@ -221,7 +194,7 @@ impl Output { #[derive(Clone)] struct ChunkState { cv: CVWords, - offset: u64, + chunk_counter: u64, buf: [u8; BLOCK_LEN], buf_len: u8, blocks_compressed: u8, @@ -230,10 +203,10 @@ struct ChunkState { } impl ChunkState { - fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self { + fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { Self { cv: *key, - offset, + chunk_counter, buf: [0; BLOCK_LEN], buf_len: 0, blocks_compressed: 0, @@ -242,15 +215,6 @@ impl ChunkState { } } - fn reset(&mut self, key: &CVWords, new_offset: u64) { - debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0); - self.cv = *key; - self.offset = new_offset; - self.buf = [0; BLOCK_LEN]; - self.buf_len = 0; - self.blocks_compressed = 0; - } - fn len(&self) -> usize { BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize } @@ -283,7 +247,7 @@ impl ChunkState { &mut self.cv, &self.buf, BLOCK_LEN as u8, - self.offset, + self.chunk_counter, block_flags, ); self.buf_len = 0; @@ -299,7 +263,7 @@ impl ChunkState { &mut self.cv, array_ref!(input, 0, BLOCK_LEN), BLOCK_LEN as u8, - self.offset, + self.chunk_counter, block_flags, ); self.blocks_compressed += 1; @@ -318,7 +282,7 @@ impl ChunkState { input_chaining_value: self.cv, block: self.buf, block_len: self.buf_len, - offset: self.offset, + counter: self.chunk_counter, flags: block_flags, platform: self.platform, } @@ -330,9 +294,9 @@ impl fmt::Debug for ChunkState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "ChunkState {{ len: {}, offset: {}, flags: {:?}, platform: {:?} }}", + "ChunkState {{ len: {}, chunk_counter: {}, flags: {:?}, platform: {:?} }}", self.len(), - self.offset, + self.chunk_counter, self.flags, self.platform ) @@ -354,6 +318,23 @@ impl fmt::Debug for ChunkState { // use full-width SIMD vectors for parent hashing. Without parallel parent // hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. +// pub for benchmarks +#[doc(hidden)] +#[derive(Clone, Copy)] +pub enum IncrementCounter { + Yes, + No, +} + +impl IncrementCounter { + fn yes(&self) -> bool { + match self { + IncrementCounter::Yes => true, + IncrementCounter::No => false, + } + } +} + // The largest power of two less than or equal to `n`, used for left_len() // immediately below, and also directly in Hasher::update(). fn largest_power_of_two_leq(n: usize) -> usize { @@ -395,14 +376,13 @@ where fn compress_chunks_parallel( input: &[u8], key: &CVWords, - offset: u64, + chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert!(!input.is_empty(), "empty chunks below the root"); debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); - debug_assert_eq!(offset % CHUNK_LEN as u64, 0, "invalid offset"); let mut chunks_exact = input.chunks_exact(CHUNK_LEN); let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new(); @@ -412,8 +392,8 @@ fn compress_chunks_parallel( platform.hash_many( &chunks_array, key, - offset, - CHUNK_OFFSET_DELTAS, + chunk_counter, + IncrementCounter::Yes, flags, CHUNK_START, CHUNK_END, @@ -424,8 +404,8 @@ fn compress_chunks_parallel( // chunk (meaning the empty message) is a different codepath. let chunks_so_far = chunks_array.len(); if !chunks_exact.remainder().is_empty() { - let chunk_offset = offset + (chunks_so_far * CHUNK_LEN) as u64; - let mut chunk_state = ChunkState::new(key, chunk_offset, flags, platform); + let counter = chunk_counter + chunks_so_far as u64; + let mut chunk_state = ChunkState::new(key, counter, flags, platform); chunk_state.update(chunks_exact.remainder()); *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = chunk_state.output().chaining_value(); @@ -462,8 +442,8 @@ fn compress_parents_parallel( platform.hash_many( &parents_array, key, - 0, // Parents always use offset 0. - PARENT_OFFSET_DELTAS, + 0, // Parents always use counter 0. + IncrementCounter::No, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. @@ -496,7 +476,7 @@ fn compress_parents_parallel( fn compress_subtree_wide( input: &[u8], key: &CVWords, - offset: u64, + chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], @@ -505,7 +485,7 @@ fn compress_subtree_wide( // when it is 1. This allows Rayon the option of multi-threading even the // 2-chunk case, which can help performance on smaller platforms. if input.len() <= platform.simd_degree() * CHUNK_LEN { - return compress_chunks_parallel(input, key, offset, flags, platform, out); + return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing @@ -514,7 +494,7 @@ fn compress_subtree_wide( // of 3 or something, we'll need a more complicated strategy.) debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); let (left, right) = input.split_at(left_len(input.len())); - let right_offset = offset + left.len() as u64; + let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree @@ -531,8 +511,8 @@ fn compress_subtree_wide( // Recurse! This uses multiple threads if the "rayon" feature is enabled. let (left_n, right_n) = join( - || compress_subtree_wide(left, key, offset, flags, platform, left_out), - || compress_subtree_wide(right, key, right_offset, flags, platform, right_out), + || compress_subtree_wide(left, key, chunk_counter, flags, platform, left_out), + || compress_subtree_wide(right, key, right_chunk_counter, flags, platform, right_out), ); // The special case again. If simd_degree=1, then we'll have left_n=1 and @@ -568,13 +548,14 @@ fn compress_subtree_wide( fn compress_subtree_to_parent_node( input: &[u8], key: &CVWords, - offset: u64, + chunk_counter: u64, flags: u8, platform: Platform, ) -> [u8; BLOCK_LEN] { debug_assert!(input.len() > CHUNK_LEN); let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let mut num_cvs = compress_subtree_wide(input, &key, offset, flags, platform, &mut cv_array); + let mut num_cvs = + compress_subtree_wide(input, &key, chunk_counter, flags, platform, &mut cv_array); debug_assert!(num_cvs >= 2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, @@ -607,7 +588,7 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { input_chaining_value: *key, block: compress_subtree_to_parent_node(input, key, 0, flags, platform), block_len: BLOCK_LEN as u8, - offset: 0, + counter: 0, flags: flags | PARENT, platform, } @@ -646,7 +627,7 @@ fn parent_node_output( input_chaining_value: *key, block, block_len: BLOCK_LEN as u8, - offset: 0, + counter: 0, flags: flags | PARENT, platform, } @@ -694,11 +675,6 @@ impl Hasher { Self::new_internal(&key_words, DERIVE_KEY) } - /// The total number of input bytes so far. - pub fn count(&self) -> u64 { - self.chunk_state.offset + self.chunk_state.len() as u64 - } - // See comment in push_cv. fn merge_cv_stack(&mut self, total_len: u64) { let post_merge_stack_len = total_len.count_ones() as usize; @@ -716,7 +692,7 @@ impl Hasher { } } - fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) { + fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { // In reference_impl.rs, we merge the new CV with existing CVs from the // stack before pushing it. We can do that because we know more input // is coming, so we know none of the merges are root. @@ -739,10 +715,10 @@ impl Hasher { // merging with the new CV itself. // // We still use the "count the 1 bits" algorithm, adjusted slightly for - // this setting, using the offset (the start of the new CV's bytes) - // rather than the final total (the end of the new CV's bytes). That + // this setting, using the new chunk's counter numer (the previous + // total number of chunks) rather than new total number of chunks. That // algorithm is explained in detail in the spec. - self.merge_cv_stack(offset); + self.merge_cv_stack(chunk_counter); self.cv_stack.push(*new_cv); } @@ -762,9 +738,13 @@ impl Hasher { // Then we'll proceed to hashing whole chunks below. debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); let chunk_cv = self.chunk_state.output().chaining_value(); - self.push_cv(&chunk_cv, self.chunk_state.offset); - let new_offset = self.chunk_state.offset + CHUNK_LEN as u64; - self.chunk_state.reset(&self.key, new_offset); + self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); + self.chunk_state = ChunkState::new( + &self.key, + self.chunk_state.chunk_counter + 1, + self.chunk_state.flags, + self.chunk_state.platform, + ); } else { return self; } @@ -786,32 +766,33 @@ impl Hasher { while input.len() > CHUNK_LEN { debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); - debug_assert_eq!(self.chunk_state.offset % CHUNK_LEN as u64, 0); let mut subtree_len = largest_power_of_two_leq(input.len()); + let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; // Shrink the subtree_len until it evenly divides the count so far. // We know it's a power of 2, so we can use a bitmask rather than // the more expensive modulus operation. Note that if the caller // consistently passes power-of-2 inputs of the same size (as is // hopefully typical), we'll always skip over this loop. - while (subtree_len - 1) as u64 & self.chunk_state.offset != 0 { + while (subtree_len - 1) as u64 & count_so_far != 0 { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash // that one chunk by itself. Otherwise, compress the subtree into a // pair of CVs. + let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; if subtree_len <= CHUNK_LEN { debug_assert_eq!(subtree_len, CHUNK_LEN); self.push_cv( &ChunkState::new( &self.key, - self.chunk_state.offset, + self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ) .update(&input[..subtree_len]) .output() .chaining_value(), - self.chunk_state.offset, + self.chunk_state.chunk_counter, ); } else { // This is the high-performance happy path, though getting here @@ -819,7 +800,7 @@ impl Hasher { let cv_pair = compress_subtree_to_parent_node( &input[..subtree_len], &self.key, - self.chunk_state.offset, + self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ); @@ -828,10 +809,13 @@ impl Hasher { // Push the two CVs we received into the CV stack in order. Because // the stack merges lazily, this guarantees we aren't merging the // root. - self.push_cv(left_cv, self.chunk_state.offset); - self.push_cv(right_cv, self.chunk_state.offset + (subtree_len as u64 / 2)); + self.push_cv(left_cv, self.chunk_state.chunk_counter); + self.push_cv( + right_cv, + self.chunk_state.chunk_counter + (subtree_chunks / 2), + ); } - self.chunk_state.offset += subtree_len as u64; + self.chunk_state.chunk_counter += subtree_chunks; input = &input[subtree_len..]; } @@ -842,7 +826,7 @@ impl Hasher { // Having added some input to the chunk_state, we know what's in // the CV stack won't become the root node, and we can do an extra // merge. This simplifies finalize(). - self.merge_cv_stack(self.chunk_state.offset); + self.merge_cv_stack(self.chunk_state.chunk_counter); } self @@ -853,7 +837,7 @@ impl Hasher { // also. Convert it directly into an Output. Otherwise, we need to // merge subtrees below. if self.cv_stack.is_empty() { - debug_assert_eq!(self.chunk_state.offset, 0); + debug_assert_eq!(self.chunk_state.chunk_counter, 0); return self.chunk_state.output(); } @@ -874,7 +858,7 @@ impl Hasher { if self.chunk_state.len() > 0 { debug_assert_eq!( self.cv_stack.len(), - self.chunk_state.offset.count_ones() as usize, + self.chunk_state.chunk_counter.count_ones() as usize, "cv stack does not need a merge" ); output = self.chunk_state.output(); @@ -930,10 +914,8 @@ impl fmt::Debug for Hasher { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, - "Hasher {{ count: {}, flags: {:?}, platform: {:?} }}", - self.count(), - self.chunk_state.flags, - self.chunk_state.platform + "Hasher {{ flags: {:?}, platform: {:?} }}", + self.chunk_state.flags, self.chunk_state.platform ) } } @@ -982,19 +964,19 @@ impl OutputReader { buf = &mut buf[take..]; self.position_within_block += take as u8; if self.position_within_block == BLOCK_LEN as u8 { - self.inner.offset += BLOCK_LEN as u64; + self.inner.counter += 1; self.position_within_block = 0; } } } pub fn position(&self) -> u64 { - self.inner.offset + self.position_within_block as u64 + self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64 } pub fn set_position(&mut self, position: u64) { self.position_within_block = (position % BLOCK_LEN as u64) as u8; - self.inner.offset = position - self.position_within_block as u64; + self.inner.counter = position / BLOCK_LEN as u64; } } diff --git a/src/platform.rs b/src/platform.rs index 66d70f6..abf9d30 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,4 +1,4 @@ -use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN}; +use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; #[cfg(feature = "c_avx512")] @@ -108,25 +108,25 @@ impl Platform { cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) { match self { - Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags), + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_in_place(cv, block, block_len, offset, flags) + sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(feature = "c_avx512")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_in_place(cv, block, block_len, offset, flags) + c_avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. #[cfg(feature = "c_neon")] - Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags), + Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), } } @@ -135,25 +135,25 @@ impl Platform { cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u8; 64] { match self { - Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags), + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_xof(cv, block, block_len, offset, flags) + sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(feature = "c_avx512")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_xof(cv, block, block_len, offset, flags) + c_avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(feature = "c_neon")] - Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags), + Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), } } @@ -171,8 +171,8 @@ impl Platform { &self, inputs: &[&A], key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -182,8 +182,8 @@ impl Platform { Platform::Portable => portable::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, @@ -195,8 +195,8 @@ impl Platform { sse41::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, @@ -209,8 +209,8 @@ impl Platform { avx2::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, @@ -224,8 +224,8 @@ impl Platform { c_avx512::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, @@ -238,8 +238,8 @@ impl Platform { c_neon::hash_many( inputs, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, diff --git a/src/portable.rs b/src/portable.rs index fa0e17d..0a569ce 100644 --- a/src/portable.rs +++ b/src/portable.rs @@ -1,5 +1,6 @@ use crate::{ - offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, }; use arrayref::{array_mut_ref, array_ref}; @@ -38,7 +39,7 @@ fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u32; 16] { let block_words = crate::platform::words_from_le_bytes_64(block); @@ -56,8 +57,8 @@ fn compress_pre( IV[1], IV[2], IV[3], - offset_low(offset), - offset_high(offset), + counter_low(counter), + counter_high(counter), block_len as u32, flags as u32, ]; @@ -77,10 +78,10 @@ pub fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) { - let state = compress_pre(cv, block, block_len, offset, flags); + let state = compress_pre(cv, block, block_len, counter, flags); cv[0] = state[0] ^ state[8]; cv[1] = state[1] ^ state[9]; @@ -96,10 +97,10 @@ pub fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u8; 64] { - let mut state = compress_pre(cv, block, block_len, offset, flags); + let mut state = compress_pre(cv, block, block_len, counter, flags); state[0] ^= state[8]; state[1] ^= state[9]; state[2] ^= state[10]; @@ -122,7 +123,7 @@ pub fn compress_xof( pub fn hash1<A: arrayvec::Array<Item = u8>>( input: &A, key: &CVWords, - offset: u64, + counter: u64, flags: u8, flags_start: u8, flags_end: u8, @@ -140,7 +141,7 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, - offset, + counter, block_flags, ); block_flags = flags; @@ -152,8 +153,8 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>( pub fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], key: &CVWords, - mut offset: u64, - offset_deltas: &OffsetDeltas, + mut counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -164,13 +165,15 @@ pub fn hash_many<A: arrayvec::Array<Item = u8>>( hash1( input, key, - offset, + counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); - offset += offset_deltas[1]; + if increment_counter.yes() { + counter += 1; + } } } diff --git a/src/sse41.rs b/src/sse41.rs index e45b22e..77cb27e 100644 --- a/src/sse41.rs +++ b/src/sse41.rs @@ -4,7 +4,8 @@ use core::arch::x86::*; use core::arch::x86_64::*; use crate::{ - offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; @@ -129,15 +130,15 @@ unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [__m128i; 4] { let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8); let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8); let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row4 = &mut set4( - offset_low(offset), - offset_high(offset), + counter_low(counter), + counter_high(counter), block_len as u32, flags as u32, ); @@ -313,10 +314,10 @@ pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) { - let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags); + let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, counter, flags); storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8); } @@ -326,11 +327,11 @@ pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u8; 64] { let [mut row1, mut row2, mut row3, mut row4] = - compress_pre(cv, block, block_len, offset, flags); + compress_pre(cv, block, block_len, counter, flags); row1 = xor(row1, row3); row2 = xor(row2, row4); row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8)); @@ -506,19 +507,20 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) } #[inline(always)] -unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, __m128i) { +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( - offset_low(offset + offset_deltas[0]), - offset_low(offset + offset_deltas[1]), - offset_low(offset + offset_deltas[2]), - offset_low(offset + offset_deltas[3]), + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), ), set4( - offset_high(offset + offset_deltas[0]), - offset_high(offset + offset_deltas[1]), - offset_high(offset + offset_deltas[2]), - offset_high(offset + offset_deltas[3]), + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), ), ) } @@ -528,8 +530,8 @@ pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -545,7 +547,7 @@ pub unsafe fn hash4( set1(key[6]), set1(key[7]), ]; - let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas); + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { @@ -573,8 +575,8 @@ pub unsafe fn hash4( set1(IV[1]), set1(IV[2]), set1(IV[3]), - offset_low_vec, - offset_high_vec, + counter_low_vec, + counter_high_vec, block_len_vec, block_flags_vec, ]; @@ -616,7 +618,7 @@ pub unsafe fn hash4( unsafe fn hash1<A: arrayvec::Array<Item = u8>>( input: &A, key: &CVWords, - offset: u64, + counter: u64, flags: u8, flags_start: u8, flags_end: u8, @@ -634,7 +636,7 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, - offset, + counter, block_flags, ); block_flags = flags; @@ -647,8 +649,8 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>( pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( mut inputs: &[&A], key: &CVWords, - mut offset: u64, - offset_deltas: &OffsetDeltas, + mut counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -664,28 +666,32 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( input_ptrs, blocks, key, - offset, - offset_deltas, + counter, + increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); + if increment_counter.yes() { + counter += DEGREE as u64; + } inputs = &inputs[DEGREE..]; - offset += DEGREE as u64 * offset_deltas[1]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, - offset, + counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); - offset += offset_deltas[1]; + if increment_counter.yes() { + counter += 1; + } } } diff --git a/src/test.rs b/src/test.rs index 404079c..e7fe96e 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1,4 +1,4 @@ -use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use arrayref::array_ref; use arrayvec::ArrayVec; use core::usize; @@ -48,13 +48,13 @@ pub fn paint_test_input(buf: &mut [u8]) { } type CompressInPlaceFn = - unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8); + unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); type CompressXofFn = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, - offset: u64, + counter: u64, flags: u8, ) -> [u8; 64]; @@ -64,18 +64,18 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn let block_len: u8 = 61; let mut block = [0; BLOCK_LEN]; paint_test_input(&mut block[..block_len as usize]); - // Use an offset with set bits in both 32-bit words. - let offset = ((5 * CHUNK_LEN as u64) << 32) + 6 * CHUNK_LEN as u64; + // Use a counter with set bits in both 32-bit words. + let counter = (5u64 << 32) + 6; let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; let portable_out = - crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags); + crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); let mut test_state = initial_state; - unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) }; + unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); let test_xof = - unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) }; + unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; assert_eq!(&portable_out[..32], &test_state_bytes[..]); assert_eq!(&portable_out[..], &test_xof[..]); @@ -84,8 +84,8 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn type HashManyFn<A> = unsafe fn( inputs: &[&A], key: &CVWords, - offset: u64, - offset_deltas: &OffsetDeltas, + counter: u64, + increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, @@ -101,8 +101,8 @@ pub fn test_hash_many_fn( const NUM_INPUTS: usize = 31; let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; crate::test::paint_test_input(&mut input_buf); - // An offset just prior to u32::MAX. - let offset = (1 << 32) - CHUNK_LEN as u64; + // A counter just prior to u32::MAX. + let counter = (1u64 << 32) - 1; // First hash chunks. let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new(); @@ -113,8 +113,8 @@ pub fn test_hash_many_fn( crate::portable::hash_many( &chunks, &TEST_KEY_WORDS, - offset, - crate::CHUNK_OFFSET_DELTAS, + counter, + IncrementCounter::Yes, crate::DERIVE_KEY, crate::CHUNK_START, crate::CHUNK_END, @@ -126,8 +126,8 @@ pub fn test_hash_many_fn( hash_many_chunks_fn( &chunks[..], &TEST_KEY_WORDS, - offset, - crate::CHUNK_OFFSET_DELTAS, + counter, + IncrementCounter::Yes, crate::DERIVE_KEY, crate::CHUNK_START, crate::CHUNK_END, @@ -153,7 +153,7 @@ pub fn test_hash_many_fn( &parents, &TEST_KEY_WORDS, 0, - crate::PARENT_OFFSET_DELTAS, + IncrementCounter::No, crate::DERIVE_KEY | crate::PARENT, 0, 0, @@ -166,7 +166,7 @@ pub fn test_hash_many_fn( &parents[..], &TEST_KEY_WORDS, 0, - crate::PARENT_OFFSET_DELTAS, + IncrementCounter::No, crate::DERIVE_KEY | crate::PARENT, 0, 0, @@ -202,10 +202,10 @@ fn test_reference_impl_size() { } #[test] -fn test_offset_words() { - let offset: u64 = (1 << 32) + 2; - assert_eq!(crate::offset_low(offset), 2); - assert_eq!(crate::offset_high(offset), 1); +fn test_counter_words() { + let counter: u64 = (1 << 32) + 2; + assert_eq!(crate::counter_low(counter), 2); + assert_eq!(crate::counter_high(counter), 1); } #[test] |
