| author | Jack O'Connor <[email protected]> | 2022-07-07 11:22:39 -0700 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-07-11 15:43:24 -0400 |
| commit | 0a721ef5690600bb10608b02cd57ec4051b4c8cf | |
| tree | df0d16c5dc43850cc7261e337995b9215f1579ef | |
| parent | 49a04ca23d10662cc7e5a38fb40f0955224852b5 | |
add more undocumented guts helpers for chunk group hashing (more_guts)
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | src/guts.rs | 120 |
| -rw-r--r-- | src/lib.rs | 62 |
2 files changed, 157 insertions, 25 deletions
```diff
diff --git a/src/guts.rs b/src/guts.rs
index ecde326..4f161f1 100644
--- a/src/guts.rs
+++ b/src/guts.rs
@@ -6,6 +6,8 @@
 //! We could stabilize something like this module in the future. If you have a
 //! use case for it, please let us know by filing a GitHub issue.
 
+use crate::{Hash, Hasher};
+
 pub const BLOCK_LEN: usize = 64;
 pub const CHUNK_LEN: usize = 1024;
 
@@ -35,7 +37,7 @@ impl ChunkState {
         self
     }
 
-    pub fn finalize(&self, is_root: bool) -> crate::Hash {
+    pub fn finalize(&self, is_root: bool) -> Hash {
         let output = self.0.output();
         if is_root {
             output.root_hash()
@@ -47,11 +49,7 @@ impl ChunkState {
 
 // As above, this currently assumes the regular hash mode. If an incremental
 // user needs keyed_hash or derive_key, we can add that.
-pub fn parent_cv(
-    left_child: &crate::Hash,
-    right_child: &crate::Hash,
-    is_root: bool,
-) -> crate::Hash {
+pub fn parent_cv(left_child: &Hash, right_child: &Hash, is_root: bool) -> Hash {
     let output = crate::parent_node_output(
         left_child.as_bytes(),
         right_child.as_bytes(),
@@ -66,6 +64,30 @@ pub fn parent_cv(
     }
 }
 
+/// Adjust a regular Hasher so that it can hash a subtree whose starting byte offset is something
+/// other than zero. Morally speaking, this parameter should be in a Hasher constructor function,
+/// but those are public APIs, and I don't want to complicate them while this is still unstable.
+/// This should only be called immediately after a Hasher is constructed, and we do our best to
+/// assert that rule. (Changing this parameter after some input has already been compressed would
+/// lead to a garbage hash.) Subtree hashes that aren't the root must use finalize_nonroot() below,
+/// and we also do our best to assert that.
+pub fn set_offset(hasher: &mut Hasher, starting_offset: u64) {
+    assert_eq!(0, hasher.cv_stack.len());
+    assert_eq!(0, hasher.chunk_state.len());
+    assert_eq!(0, starting_offset % CHUNK_LEN as u64);
+    hasher.initial_chunk_counter = starting_offset / CHUNK_LEN as u64;
+    hasher.chunk_state.chunk_counter = hasher.initial_chunk_counter;
+}
+
+/// Finalize a Hasher as a non-root subtree. Callers using set_offset() above must use this instead
+/// of the regular .finalize() method for any non-root subtrees, or else they'll get a garbage hash.
+/// BUT BE CAREFUL: Whatever your subtree size might be, you probably need to account for the case
+/// where you have only one subtree (with offset zero), and in that case it *does* need to be
+/// root-finalized.
+pub fn finalize_nonroot(hasher: &Hasher) -> Hash {
+    Hash(hasher.final_output().chaining_value())
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -80,7 +102,7 @@ mod test {
 
     #[test]
     fn test_parents() {
-        let mut hasher = crate::Hasher::new();
+        let mut hasher = Hasher::new();
         let mut buf = [0; crate::CHUNK_LEN];
         buf[0] = 'a' as u8;
@@ -98,4 +120,88 @@ mod test {
         let root = parent_cv(&parent, &chunk2_cv, true);
         assert_eq!(hasher.finalize(), root);
     }
+
+    #[test]
+    fn test_offset() {
+        let mut input = [0; 12 * CHUNK_LEN + 1];
+        crate::test::paint_test_input(&mut input);
+
+        let mut hasher0 = Hasher::new();
+        set_offset(&mut hasher0, 0 * CHUNK_LEN as u64);
+        hasher0.update(&input[0 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree0 = finalize_nonroot(&hasher0);
+
+        let mut hasher1 = Hasher::new();
+        set_offset(&mut hasher1, 4 * CHUNK_LEN as u64);
+        hasher1.update(&input[4 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree1 = finalize_nonroot(&hasher1);
+
+        let mut hasher2 = Hasher::new();
+        set_offset(&mut hasher2, 8 * CHUNK_LEN as u64);
+        hasher2.update(&input[8 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree2 = finalize_nonroot(&hasher2);
+
+        let mut hasher3 = Hasher::new();
+        set_offset(&mut hasher3, 12 * CHUNK_LEN as u64);
+        hasher3.update(&input[12 * CHUNK_LEN..][..1]);
+        let subtree3 = finalize_nonroot(&hasher3);
+
+        let parent0 = parent_cv(&subtree0, &subtree1, false);
+        let parent1 = parent_cv(&subtree2, &subtree3, false);
+        let root = parent_cv(&parent0, &parent1, true);
+
+        assert_eq!(crate::hash(&input), root);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_odd_offset() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_nonempty_offset_short() {
+        let mut hasher = Hasher::new();
+        hasher.update(b"hello");
+        set_offset(&mut hasher, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_nonempty_offset_long() {
+        let mut hasher = Hasher::new();
+        hasher.update(&[0; 4096]);
+        set_offset(&mut hasher, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_update_too_much() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 12 * CHUNK_LEN as u64);
+        hasher.update(&[0; 4 * CHUNK_LEN + 1]);
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_root_finalize_xof() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+        hasher.update(&[0; 2 * CHUNK_LEN]);
+        hasher.finalize_xof();
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_root_finalize() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+        hasher.update(&[0; 2 * CHUNK_LEN]);
+        hasher.finalize();
+    }
 }
```
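Taken together, `set_offset`, `finalize_nonroot`, and the existing `parent_cv` let a caller hash aligned, power-of-two-sized chunk groups independently (for example, on separate threads) and then join the resulting chaining values into the root hash, exactly as the new `test_offset` above does. The following is a minimal external sketch of that calling pattern, not part of the commit: the helper name `hash_subtree` is illustrative, and it assumes the unstable `guts` APIs from this `more_guts` branch are available as `blake3::guts`:

```rust
use blake3::guts::{CHUNK_LEN, finalize_nonroot, parent_cv, set_offset};
use blake3::Hasher;

// Each subtree must span a power-of-two number of chunks, and its starting
// offset must be a multiple of its size. Here: 4 chunks per subtree.
const SUBTREE_LEN: usize = 4 * CHUNK_LEN;

// Hash one non-root subtree of `input` beginning at byte `offset`.
fn hash_subtree(input: &[u8], offset: usize) -> blake3::Hash {
    let mut hasher = Hasher::new();
    // Must be called before any input is added; asserts on misuse.
    set_offset(&mut hasher, offset as u64);
    hasher.update(&input[offset..offset + SUBTREE_LEN]);
    // Non-root finalization: yields a chaining value, not the root hash.
    finalize_nonroot(&hasher)
}

fn main() {
    let input = vec![0xab; 2 * SUBTREE_LEN];
    // These two calls are independent and could run on separate threads.
    let left = hash_subtree(&input, 0);
    let right = hash_subtree(&input, SUBTREE_LEN);
    // The final join is the root, so it takes is_root = true. Caveat from the
    // docs above: if the whole input were a single subtree at offset 0, it
    // would instead need regular root finalization.
    let root = parent_cv(&left, &right, true);
    assert_eq!(blake3::hash(&input), root);
}
```

The `src/lib.rs` half of the diff, below, threads `initial_chunk_counter` through the `Hasher` internals to make this possible.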
```diff
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -939,6 +939,8 @@ fn parent_node_output(
 pub struct Hasher {
     key: CVWords,
     chunk_state: ChunkState,
+    // Always zero, unless you use guts::set_offset to change it.
+    initial_chunk_counter: u64,
     // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
     // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
     // requires a 4th entry, rather than merging everything down to 1, because
@@ -952,6 +954,7 @@ impl Hasher {
         Self {
             key: *key,
             chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+            initial_chunk_counter: 0,
             cv_stack: ArrayVec::new(),
         }
     }
@@ -998,6 +1001,10 @@ impl Hasher {
         self
     }
 
+    fn chunks_in_cv_stack(&self) -> u64 {
+        self.chunk_state.chunk_counter - self.initial_chunk_counter
+    }
+
     // As described in push_cv() below, we do "lazy merging", delaying merges
     // until right before the next CV is about to be added. This is different
     // from the reference implementation. Another difference is that we aren't
@@ -1009,8 +1016,8 @@ impl Hasher {
     // the CV on top of the stack. The principle is the same: each CV that
     // should remain in the stack is represented by a 1-bit in the total number
     // of chunks (or bytes) so far.
-    fn merge_cv_stack(&mut self, total_len: u64) {
-        let post_merge_stack_len = total_len.count_ones() as usize;
+    fn merge_cv_stack(&mut self) {
+        let post_merge_stack_len = self.chunks_in_cv_stack().count_ones() as usize;
         while self.cv_stack.len() > post_merge_stack_len {
             let right_child = self.cv_stack.pop().unwrap();
             let left_child = self.cv_stack.pop().unwrap();
@@ -1058,9 +1065,10 @@ impl Hasher {
     // merging with each of them separately, so that the second CV will always
     // remain unmerged. (That also helps us support extendable output when
     // we're hashing an input all-at-once.)
-    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
-        self.merge_cv_stack(chunk_counter);
+    fn push_cv(&mut self, new_cv: &CVBytes, num_chunks: u64) {
+        self.merge_cv_stack();
         self.cv_stack.push(*new_cv);
+        self.chunk_state.chunk_counter += num_chunks;
     }
 
     /// Add input bytes to the hash state. You can call this any number of
@@ -1106,6 +1114,20 @@ impl Hasher {
     }
 
     fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
+        // In debug mode, check that this update doesn't exceed the maximum size of the subtree.
+        // This is only relevant when the caller uses guts::set_offset to set a non-zero starting
+        // offset.
+        if self.initial_chunk_counter != 0 {
+            // For example, if the starting chunk counter is 13 (0b1101), then this subtree can
+            // only be a single chunk. But if it's 12 (0b1100), it can be up to four chunks. Note
+            // that this shift would overflow and panic if initial_chunk_counter was zero, and we
+            // rely on the if-condition above to prevent that.
+            let max_subtree_chunks = 1 << self.initial_chunk_counter.trailing_zeros();
+            let max_subtree_len = CHUNK_LEN as u64 * max_subtree_chunks;
+            let new_subtree_len = self.count() + input.len() as u64;
+            debug_assert!(new_subtree_len <= max_subtree_len);
+        }
+
         // If we have some partial chunk bytes in the internal chunk_state, we
         // need to finish that chunk first.
         if self.chunk_state.len() > 0 {
@@ -1119,10 +1141,10 @@ impl Hasher {
             // Then we'll proceed to hashing whole chunks below.
             debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
             let chunk_cv = self.chunk_state.output().chaining_value();
-            self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+            self.push_cv(&chunk_cv, 1);
             self.chunk_state = ChunkState::new(
                 &self.key,
-                self.chunk_state.chunk_counter + 1,
+                self.chunk_state.chunk_counter, // incremented in push_cv
                 self.chunk_state.flags,
                 self.chunk_state.platform,
             );
@@ -1185,7 +1207,7 @@ impl Hasher {
                         .update(&input[..subtree_len])
                         .output()
                         .chaining_value(),
-                    self.chunk_state.chunk_counter,
+                    subtree_chunks,
                 );
             } else {
                 // This is the high-performance happy path, though getting here
@@ -1202,13 +1224,9 @@ impl Hasher {
                 // Push the two CVs we received into the CV stack in order. Because
                 // the stack merges lazily, this guarantees we aren't merging the
                 // root.
-                self.push_cv(left_cv, self.chunk_state.chunk_counter);
-                self.push_cv(
-                    right_cv,
-                    self.chunk_state.chunk_counter + (subtree_chunks / 2),
-                );
+                self.push_cv(left_cv, subtree_chunks / 2);
+                self.push_cv(right_cv, subtree_chunks / 2);
             }
-            self.chunk_state.chunk_counter += subtree_chunks;
             input = &input[subtree_len..];
         }
 
@@ -1219,7 +1237,7 @@ impl Hasher {
             // Having added some input to the chunk_state, we know what's in
            // the CV stack won't become the root node, and we can do an extra
             // merge. This simplifies finalize().
-            self.merge_cv_stack(self.chunk_state.chunk_counter);
+            self.merge_cv_stack();
         }
         self
@@ -1230,7 +1248,7 @@ impl Hasher {
         // also. Convert it directly into an Output. Otherwise, we need to
         // merge subtrees below.
         if self.cv_stack.is_empty() {
-            debug_assert_eq!(self.chunk_state.chunk_counter, 0);
+            debug_assert_eq!(self.chunks_in_cv_stack(), 0);
             return self.chunk_state.output();
         }
@@ -1251,8 +1269,8 @@ impl Hasher {
         if self.chunk_state.len() > 0 {
             debug_assert_eq!(
                 self.cv_stack.len(),
-                self.chunk_state.chunk_counter.count_ones() as usize,
-                "cv stack does not need a merge"
+                self.chunks_in_cv_stack().count_ones() as usize,
+                "cv stack should not need a merge"
             );
             output = self.chunk_state.output();
         } else {
@@ -1285,6 +1303,10 @@ impl Hasher {
     /// This method is idempotent. Calling it twice will give the same result.
     /// You can also add more input and finalize again.
     pub fn finalize(&self) -> Hash {
+        debug_assert_eq!(
+            0, self.initial_chunk_counter,
+            "the root subtree must start at offset 0",
+        );
         self.final_output().root_hash()
     }
 
@@ -1296,12 +1318,16 @@ impl Hasher {
     ///
     /// [`OutputReader`]: struct.OutputReader.html
     pub fn finalize_xof(&self) -> OutputReader {
+        debug_assert_eq!(
+            0, self.initial_chunk_counter,
+            "the root subtree must start at offset 0",
+        );
         OutputReader::new(self.final_output())
     }
 
     /// Return the total number of bytes hashed so far.
     pub fn count(&self) -> u64 {
-        self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
+        self.chunks_in_cv_stack() * CHUNK_LEN as u64 + self.chunk_state.len() as u64
     }
 }
```
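Two bit tricks in this diff are worth spelling out. First, `merge_cv_stack` keeps one stacked CV per 1-bit in the number of completed chunks. Second, the new check in `update_with_join` bounds a non-zero-offset subtree by the largest power of two dividing its starting chunk counter, because a subtree in the BLAKE3 tree is aligned to its own size. A standalone sketch of that arithmetic, with illustrative names not taken from the crate:

```rust
/// Number of CVs left on the lazy-merge stack after `total_chunks` complete
/// chunks: one per 1-bit, mirroring merge_cv_stack above.
fn post_merge_stack_len(total_chunks: u64) -> u32 {
    total_chunks.count_ones()
}

/// Maximum chunks a subtree may span when its first chunk has non-zero index
/// `initial_chunk_counter`, mirroring the update_with_join debug check: the
/// largest power of two that divides the starting index.
fn max_subtree_chunks(initial_chunk_counter: u64) -> u64 {
    assert_ne!(initial_chunk_counter, 0, "offset zero is unbounded");
    1 << initial_chunk_counter.trailing_zeros()
}

fn main() {
    // 7 chunks = 0b111: three unmerged CVs (subtrees of 4, 2, and 1 chunks).
    assert_eq!(post_merge_stack_len(7), 3);
    // The 8th chunk = 0b1000: everything merges down to a single CV.
    assert_eq!(post_merge_stack_len(8), 1);

    // Starting at chunk 13 (0b1101), a subtree can hold only 1 chunk;
    // starting at chunk 12 (0b1100), it can hold up to 4.
    assert_eq!(max_subtree_chunks(13), 1);
    assert_eq!(max_subtree_chunks(12), 4);
}
```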
