| author | Jack O'Connor <[email protected]> | 2022-07-07 11:22:39 -0700 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-07-11 15:43:24 -0400 |
| commit | 0a721ef5690600bb10608b02cd57ec4051b4c8cf | |
| tree | df0d16c5dc43850cc7261e337995b9215f1579ef | |
| parent | 49a04ca23d10662cc7e5a38fb40f0955224852b5 | |
add more undocumented guts helpers for chunk group hashing (more_guts)
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | src/guts.rs | 120 |
| -rw-r--r-- | src/lib.rs | 62 |
2 files changed, 157 insertions, 25 deletions
```diff
diff --git a/src/guts.rs b/src/guts.rs
index ecde326..4f161f1 100644
--- a/src/guts.rs
+++ b/src/guts.rs
@@ -6,6 +6,8 @@
 //! We could stabilize something like this module in the future. If you have a
 //! use case for it, please let us know by filing a GitHub issue.
 
+use crate::{Hash, Hasher};
+
 pub const BLOCK_LEN: usize = 64;
 pub const CHUNK_LEN: usize = 1024;
 
@@ -35,7 +37,7 @@ impl ChunkState {
         self
     }
 
-    pub fn finalize(&self, is_root: bool) -> crate::Hash {
+    pub fn finalize(&self, is_root: bool) -> Hash {
         let output = self.0.output();
         if is_root {
             output.root_hash()
@@ -47,11 +49,7 @@ impl ChunkState {
 
 // As above, this currently assumes the regular hash mode. If an incremental
 // user needs keyed_hash or derive_key, we can add that.
-pub fn parent_cv(
-    left_child: &crate::Hash,
-    right_child: &crate::Hash,
-    is_root: bool,
-) -> crate::Hash {
+pub fn parent_cv(left_child: &Hash, right_child: &Hash, is_root: bool) -> Hash {
     let output = crate::parent_node_output(
         left_child.as_bytes(),
         right_child.as_bytes(),
@@ -66,6 +64,30 @@ pub fn parent_cv(
     }
 }
 
+/// Adjust a regular Hasher so that it can hash a subtree whose starting byte offset is something
+/// other than zero. Morally speaking, this parameter should be in a Hasher constructor function,
+/// but those are public APIs, and I don't want to complicate them while this is still unstable.
+/// This should only be called immediately after a Hasher is constructed, and we do our best to
+/// assert that rule. (Changing this parameter after some input has already been compressed would
+/// lead to a garbage hash.) Subtree hashes that aren't the root must use finalize_nonroot() below,
+/// and we also do our best to assert that.
+pub fn set_offset(hasher: &mut Hasher, starting_offset: u64) {
+    assert_eq!(0, hasher.cv_stack.len());
+    assert_eq!(0, hasher.chunk_state.len());
+    assert_eq!(0, starting_offset % CHUNK_LEN as u64);
+    hasher.initial_chunk_counter = starting_offset / CHUNK_LEN as u64;
+    hasher.chunk_state.chunk_counter = hasher.initial_chunk_counter;
+}
+
+/// Finalize a Hasher as a non-root subtree. Callers using set_offset() above must use this instead
+/// of the regular .finalize() method for any non-root subtrees, or else they'll get a garbage hash.
+/// BUT BE CAREFUL: Whatever your subtree size might be, you probably need to account for the case
+/// where you have only one subtree (with offset zero), and in that case it *does* need to be
+/// root-finalized.
+pub fn finalize_nonroot(hasher: &Hasher) -> Hash {
+    Hash(hasher.final_output().chaining_value())
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -80,7 +102,7 @@ mod test {
 
     #[test]
     fn test_parents() {
-        let mut hasher = crate::Hasher::new();
+        let mut hasher = Hasher::new();
         let mut buf = [0; crate::CHUNK_LEN];
         buf[0] = 'a' as u8;
@@ -98,4 +120,88 @@ mod test {
         let root = parent_cv(&parent, &chunk2_cv, true);
         assert_eq!(hasher.finalize(), root);
     }
+
+    #[test]
+    fn test_offset() {
+        let mut input = [0; 12 * CHUNK_LEN + 1];
+        crate::test::paint_test_input(&mut input);
+
+        let mut hasher0 = Hasher::new();
+        set_offset(&mut hasher0, 0 * CHUNK_LEN as u64);
+        hasher0.update(&input[0 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree0 = finalize_nonroot(&hasher0);
+
+        let mut hasher1 = Hasher::new();
+        set_offset(&mut hasher1, 4 * CHUNK_LEN as u64);
+        hasher1.update(&input[4 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree1 = finalize_nonroot(&hasher1);
+
+        let mut hasher2 = Hasher::new();
+        set_offset(&mut hasher2, 8 * CHUNK_LEN as u64);
+        hasher2.update(&input[8 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+        let subtree2 = finalize_nonroot(&hasher2);
+
+        let mut hasher3 = Hasher::new();
+        set_offset(&mut hasher3, 12 * CHUNK_LEN as u64);
+        hasher3.update(&input[12 * CHUNK_LEN..][..1]);
+        let subtree3 = finalize_nonroot(&hasher3);
+
+        let parent0 = parent_cv(&subtree0, &subtree1, false);
+        let parent1 = parent_cv(&subtree2, &subtree3, false);
+        let root = parent_cv(&parent0, &parent1, true);
+
+        assert_eq!(crate::hash(&input), root);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_odd_offset() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_nonempty_offset_short() {
+        let mut hasher = Hasher::new();
+        hasher.update(b"hello");
+        set_offset(&mut hasher, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_nonempty_offset_long() {
+        let mut hasher = Hasher::new();
+        hasher.update(&[0; 4096]);
+        set_offset(&mut hasher, 0);
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_update_too_much() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 12 * CHUNK_LEN as u64);
+        hasher.update(&[0; 4 * CHUNK_LEN + 1]);
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_root_finalize_xof() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+        hasher.update(&[0; 2 * CHUNK_LEN]);
+        hasher.finalize_xof();
+    }
+
+    #[test]
+    #[should_panic]
+    #[cfg(debug_assertions)]
+    fn test_offset_then_root_finalize() {
+        let mut hasher = Hasher::new();
+        set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+        hasher.update(&[0; 2 * CHUNK_LEN]);
+        hasher.finalize();
+    }
 }
```
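Taken together, `set_offset`, `finalize_nonroot`, and the existing `parent_cv` let a caller hash aligned, power-of-two-sized chunk groups independently (for example, on separate threads) and then join the resulting chaining values into the root hash, exactly as the new `test_offset` above does. The following is a minimal external sketch of that calling pattern, not part of the commit: the helper name `hash_subtree` is illustrative, and it assumes the unstable `guts` APIs from this `more_guts` branch are available as `blake3::guts`:

```rust
use blake3::guts::{CHUNK_LEN, finalize_nonroot, parent_cv, set_offset};
use blake3::Hasher;

// Each subtree must span a power-of-two number of chunks, and its starting
// offset must be a multiple of its size. Here: 4 chunks per subtree.
const SUBTREE_LEN: usize = 4 * CHUNK_LEN;

// Hash one non-root subtree of `input` beginning at byte `offset`.
fn hash_subtree(input: &[u8], offset: usize) -> blake3::Hash {
    let mut hasher = Hasher::new();
    // Must be called before any input is added; asserts on misuse.
    set_offset(&mut hasher, offset as u64);
    hasher.update(&input[offset..offset + SUBTREE_LEN]);
    // Non-root finalization: yields a chaining value, not the root hash.
    finalize_nonroot(&hasher)
}

fn main() {
    let input = vec![0xab; 2 * SUBTREE_LEN];
    // These two calls are independent and could run on separate threads.
    let left = hash_subtree(&input, 0);
    let right = hash_subtree(&input, SUBTREE_LEN);
    // The final join is the root, so it takes is_root = true. Caveat from the
    // docs above: if the whole input were a single subtree at offset 0, it
    // would instead need regular root finalization.
    let root = parent_cv(&left, &right, true);
    assert_eq!(blake3::hash(&input), root);
}
```

The `src/lib.rs` half of the diff, below, threads `initial_chunk_counter` through the `Hasher` internals to make this possible.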
```diff
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -939,6 +939,8 @@ fn parent_node_output(
 pub struct Hasher {
     key: CVWords,
     chunk_state: ChunkState,
+    // Always zero, unless you use guts::set_offset to change it.
+    initial_chunk_counter: u64,
     // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
     // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
     // requires a 4th entry, rather than merging everything down to 1, because
@@ -952,6 +954,7 @@ impl Hasher {
         Self {
             key: *key,
             chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+            initial_chunk_counter: 0,
             cv_stack: ArrayVec::new(),
         }
     }
@@ -998,6 +1001,10 @@ impl Hasher {
         self
     }
 
+    fn chunks_in_cv_stack(&self) -> u64 {
+        self.chunk_state.chunk_counter - self.initial_chunk_counter
+    }
+
     // As described in push_cv() below, we do "lazy merging", delaying merges
     // until right before the next CV is about to be added. This is different
     // from the reference implementation. Another difference is that we aren't
@@ -1009,8 +1016,8 @@ impl Hasher {
     // the CV on top of the stack. The principle is the same: each CV that
     // should remain in the stack is represented by a 1-bit in the total number
     // of chunks (or bytes) so far.
-    fn merge_cv_stack(&mut self, total_len: u64) {
-        let post_merge_stack_len = total_len.count_ones() as usize;
+    fn merge_cv_stack(&mut self) {
+        let post_merge_stack_len = self.chunks_in_cv_stack().count_ones() as usize;
         while self.cv_stack.len() > post_merge_stack_len {
             let right_child = self.cv_stack.pop().unwrap();
             let left_child = self.cv_stack.pop().unwrap();
@@ -1058,9 +1065,10 @@ impl Hasher {
     // merging with each of them separately, so that the second CV will always
     // remain unmerged. (That also helps us support extendable output when
     // we're hashing an input all-at-once.)
-    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
-        self.merge_cv_stack(chunk_counter);
+    fn push_cv(&mut self, new_cv: &CVBytes, num_chunks: u64) {
+        self.merge_cv_stack();
         self.cv_stack.push(*new_cv);
+        self.chunk_state.chunk_counter += num_chunks;
     }
 
     /// Add input bytes to the hash state. You can call this any number of
@@ -1106,6 +1114,20 @@ impl Hasher {
     }
 
     fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
+        // In debug mode, check that this update doesn't exceed the maximum size of the subtree.
+        // This is only relevant when the caller uses guts::set_offset to set a non-zero starting
+        // offset.
+        if self.initial_chunk_counter != 0 {
+            // For example, if the starting chunk counter is 13 (0b1101), then this subtree can
+            // only be a single chunk. But if it's 12 (0b1100), it can be up to four chunks. Note
+            // that this shift would overflow and panic if initial_chunk_counter was zero, and we
+            // rely on the if-condition above to prevent that.
+            let max_subtree_chunks = 1 << self.initial_chunk_counter.trailing_zeros();
+            let max_subtree_len = CHUNK_LEN as u64 * max_subtree_chunks;
+            let new_subtree_len = self.count() + input.len() as u64;
+            debug_assert!(new_subtree_len <= max_subtree_len);
+        }
+
         // If we have some partial chunk bytes in the internal chunk_state, we
         // need to finish that chunk first.
         if self.chunk_state.len() > 0 {
@@ -1119,10 +1141,10 @@ impl Hasher {
             // Then we'll proceed to hashing whole chunks below.
             debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
             let chunk_cv = self.chunk_state.output().chaining_value();
-            self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+            self.push_cv(&chunk_cv, 1);
             self.chunk_state = ChunkState::new(
                 &self.key,
-                self.chunk_state.chunk_counter + 1,
+                self.chunk_state.chunk_counter, // incremented in push_cv
                 self.chunk_state.flags,
                 self.chunk_state.platform,
             );
@@ -1185,7 +1207,7 @@ impl Hasher {
                         .update(&input[..subtree_len])
                         .output()
                         .chaining_value(),
-                    self.chunk_state.chunk_counter,
+                    subtree_chunks,
                 );
             } else {
                 // This is the high-performance happy path, though getting here
@@ -1202,13 +1224,9 @@ impl Hasher {
                 // Push the two CVs we received into the CV stack in order. Because
                 // the stack merges lazily, this guarantees we aren't merging the
                 // root.
-                self.push_cv(left_cv, self.chunk_state.chunk_counter);
-                self.push_cv(
-                    right_cv,
-                    self.chunk_state.chunk_counter + (subtree_chunks / 2),
-                );
+                self.push_cv(left_cv, subtree_chunks / 2);
+                self.push_cv(right_cv, subtree_chunks / 2);
             }
-            self.chunk_state.chunk_counter += subtree_chunks;
             input = &input[subtree_len..];
         }
 
@@ -1219,7 +1237,7 @@ impl Hasher {
             // Having added some input to the chunk_state, we know what's in
            // the CV stack won't become the root node, and we can do an extra
             // merge. This simplifies finalize().
-            self.merge_cv_stack(self.chunk_state.chunk_counter);
+            self.merge_cv_stack();
         }
         self
@@ -1230,7 +1248,7 @@ impl Hasher {
         // also. Convert it directly into an Output. Otherwise, we need to
         // merge subtrees below.
         if self.cv_stack.is_empty() {
-            debug_assert_eq!(self.chunk_state.chunk_counter, 0);
+            debug_assert_eq!(self.chunks_in_cv_stack(), 0);
             return self.chunk_state.output();
         }
@@ -1251,8 +1269,8 @@ impl Hasher {
         if self.chunk_state.len() > 0 {
             debug_assert_eq!(
                 self.cv_stack.len(),
-                self.chunk_state.chunk_counter.count_ones() as usize,
-                "cv stack does not need a merge"
+                self.chunks_in_cv_stack().count_ones() as usize,
+                "cv stack should not need a merge"
             );
             output = self.chunk_state.output();
         } else {
@@ -1285,6 +1303,10 @@ impl Hasher {
     /// This method is idempotent. Calling it twice will give the same result.
     /// You can also add more input and finalize again.
     pub fn finalize(&self) -> Hash {
+        debug_assert_eq!(
+            0, self.initial_chunk_counter,
+            "the root subtree must start at offset 0",
+        );
         self.final_output().root_hash()
     }
 
@@ -1296,12 +1318,16 @@ impl Hasher {
     ///
     /// [`OutputReader`]: struct.OutputReader.html
     pub fn finalize_xof(&self) -> OutputReader {
+        debug_assert_eq!(
+            0, self.initial_chunk_counter,
+            "the root subtree must start at offset 0",
+        );
         OutputReader::new(self.final_output())
     }
 
     /// Return the total number of bytes hashed so far.
     pub fn count(&self) -> u64 {
-        self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
+        self.chunks_in_cv_stack() * CHUNK_LEN as u64 + self.chunk_state.len() as u64
     }
 }
```
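Two bit tricks in this diff are worth spelling out. First, `merge_cv_stack` keeps one stacked CV per 1-bit in the number of completed chunks. Second, the new check in `update_with_join` bounds a non-zero-offset subtree by the largest power of two dividing its starting chunk counter, because a subtree in the BLAKE3 tree is aligned to its own size. A standalone sketch of that arithmetic, with illustrative names not taken from the crate:

```rust
/// Number of CVs left on the lazy-merge stack after `total_chunks` complete
/// chunks: one per 1-bit, mirroring merge_cv_stack above.
fn post_merge_stack_len(total_chunks: u64) -> u32 {
    total_chunks.count_ones()
}

/// Maximum chunks a subtree may span when its first chunk has non-zero index
/// `initial_chunk_counter`, mirroring the update_with_join debug check: the
/// largest power of two that divides the starting index.
fn max_subtree_chunks(initial_chunk_counter: u64) -> u64 {
    assert_ne!(initial_chunk_counter, 0, "offset zero is unbounded");
    1 << initial_chunk_counter.trailing_zeros()
}

fn main() {
    // 7 chunks = 0b111: three unmerged CVs (subtrees of 4, 2, and 1 chunks).
    assert_eq!(post_merge_stack_len(7), 3);
    // The 8th chunk = 0b1000: everything merges down to a single CV.
    assert_eq!(post_merge_stack_len(8), 1);

    // Starting at chunk 13 (0b1101), a subtree can hold only 1 chunk;
    // starting at chunk 12 (0b1100), it can hold up to 4.
    assert_eq!(max_subtree_chunks(13), 1);
    assert_eq!(max_subtree_chunks(12), 4);
}
```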
