author     Jack O'Connor <[email protected]>  2022-07-07 11:22:39 -0700
committer  Jack O'Connor <[email protected]>  2022-07-11 15:43:24 -0400
commit     0a721ef5690600bb10608b02cd57ec4051b4c8cf (patch)
tree       df0d16c5dc43850cc7261e337995b9215f1579ef
parent     49a04ca23d10662cc7e5a38fb40f0955224852b5 (diff)

add more undocumented guts helpers for chunk group hashing (more_guts)

-rw-r--r--  src/guts.rs | 120
-rw-r--r--  src/lib.rs  |  62
2 files changed, 157 insertions(+), 25 deletions(-)
diff --git a/src/guts.rs b/src/guts.rs
index ecde326..4f161f1 100644
--- a/src/guts.rs
+++ b/src/guts.rs
@@ -6,6 +6,8 @@
//! We could stabilize something like this module in the future. If you have a
//! use case for it, please let us know by filing a GitHub issue.
+use crate::{Hash, Hasher};
+
pub const BLOCK_LEN: usize = 64;
pub const CHUNK_LEN: usize = 1024;
@@ -35,7 +37,7 @@ impl ChunkState {
self
}
- pub fn finalize(&self, is_root: bool) -> crate::Hash {
+ pub fn finalize(&self, is_root: bool) -> Hash {
let output = self.0.output();
if is_root {
output.root_hash()
@@ -47,11 +49,7 @@ impl ChunkState {
// As above, this currently assumes the regular hash mode. If an incremental
// user needs keyed_hash or derive_key, we can add that.
-pub fn parent_cv(
- left_child: &crate::Hash,
- right_child: &crate::Hash,
- is_root: bool,
-) -> crate::Hash {
+pub fn parent_cv(left_child: &Hash, right_child: &Hash, is_root: bool) -> Hash {
let output = crate::parent_node_output(
left_child.as_bytes(),
right_child.as_bytes(),
@@ -66,6 +64,30 @@ pub fn parent_cv(
}
}
+/// Adjust a regular Hasher so that it can hash a subtree whose starting byte offset is something
+/// other than zero. Morally speaking, this parameter should be in a Hasher constructor function,
+/// but those are public APIs, and I don't want to complicate them while this is still unstable.
+/// This should only be called immediately after a Hasher is constructed, and we do our best to
+/// assert that rule. (Changing this parameter after some input has already been compressed would
+/// lead to a garbage hash.) Subtree hashes that aren't the root must use finalize_nonroot() below,
+/// and we also do our best to assert that.
+pub fn set_offset(hasher: &mut Hasher, starting_offset: u64) {
+ assert_eq!(0, hasher.cv_stack.len());
+ assert_eq!(0, hasher.chunk_state.len());
+ assert_eq!(0, starting_offset % CHUNK_LEN as u64);
+ hasher.initial_chunk_counter = starting_offset / CHUNK_LEN as u64;
+ hasher.chunk_state.chunk_counter = hasher.initial_chunk_counter;
+}
+
+/// Finalize a Hasher as a non-root subtree. Callers using set_offset() above must use this instead
+/// of the regular .finalize() method for any non-root subtrees, or else they'll get a garbage hash.
+/// BUT BE CAREFUL: Whatever your subtree size might be, you probably need to account for the case
+/// where you have only one subtree (with offset zero), and in that case it *does* need to be
+/// root-finalized.
+pub fn finalize_nonroot(hasher: &Hasher) -> Hash {
+ Hash(hasher.final_output().chaining_value())
+}
+
#[cfg(test)]
mod test {
use super::*;
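
For orientation (not part of the commit): a minimal sketch of the calling pattern these helpers enable, assuming the crate is consumed as blake3 with this branch applied. The hash_two_subtrees function is hypothetical.

    use blake3::guts::{finalize_nonroot, parent_cv, set_offset, CHUNK_LEN};
    use blake3::Hasher;

    // Hypothetical caller: hash a 4-chunk input as two independent 2-chunk
    // subtrees, then join them into the root.
    fn hash_two_subtrees(input: &[u8]) -> blake3::Hash {
        assert_eq!(input.len(), 4 * CHUNK_LEN);

        // Left subtree: chunks 0 and 1, starting at byte offset 0.
        let mut left = Hasher::new();
        set_offset(&mut left, 0);
        left.update(&input[..2 * CHUNK_LEN]);
        let left_cv = finalize_nonroot(&left);

        // Right subtree: chunks 2 and 3, starting at byte offset 2 * CHUNK_LEN.
        let mut right = Hasher::new();
        set_offset(&mut right, 2 * CHUNK_LEN as u64);
        right.update(&input[2 * CHUNK_LEN..]);
        let right_cv = finalize_nonroot(&right);

        // The final parent is the root and must be root-finalized. The result
        // matches blake3::hash(input), because each half is a complete subtree.
        parent_cv(&left_cv, &right_cv, true)
    }
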
@@ -80,7 +102,7 @@ mod test {
#[test]
fn test_parents() {
- let mut hasher = crate::Hasher::new();
+ let mut hasher = Hasher::new();
let mut buf = [0; crate::CHUNK_LEN];
buf[0] = 'a' as u8;
@@ -98,4 +120,88 @@ mod test {
let root = parent_cv(&parent, &chunk2_cv, true);
assert_eq!(hasher.finalize(), root);
}
+
+ #[test]
+ fn test_offset() {
+ let mut input = [0; 12 * CHUNK_LEN + 1];
+ crate::test::paint_test_input(&mut input);
+
+ let mut hasher0 = Hasher::new();
+ set_offset(&mut hasher0, 0 * CHUNK_LEN as u64);
+ hasher0.update(&input[0 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+ let subtree0 = finalize_nonroot(&hasher0);
+
+ let mut hasher1 = Hasher::new();
+ set_offset(&mut hasher1, 4 * CHUNK_LEN as u64);
+ hasher1.update(&input[4 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+ let subtree1 = finalize_nonroot(&hasher1);
+
+ let mut hasher2 = Hasher::new();
+ set_offset(&mut hasher2, 8 * CHUNK_LEN as u64);
+ hasher2.update(&input[8 * CHUNK_LEN..][..4 * CHUNK_LEN]);
+ let subtree2 = finalize_nonroot(&hasher2);
+
+ let mut hasher3 = Hasher::new();
+ set_offset(&mut hasher3, 12 * CHUNK_LEN as u64);
+ hasher3.update(&input[12 * CHUNK_LEN..][..1]);
+ let subtree3 = finalize_nonroot(&hasher3);
+
+ let parent0 = parent_cv(&subtree0, &subtree1, false);
+ let parent1 = parent_cv(&subtree2, &subtree3, false);
+ let root = parent_cv(&parent0, &parent1, true);
+
+ assert_eq!(crate::hash(&input), root);
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_odd_offset() {
+ let mut hasher = Hasher::new();
+ set_offset(&mut hasher, 1);
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_nonempty_offset_short() {
+ let mut hasher = Hasher::new();
+ hasher.update(b"hello");
+ set_offset(&mut hasher, 0);
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_nonempty_offset_long() {
+ let mut hasher = Hasher::new();
+ hasher.update(&[0; 4096]);
+ set_offset(&mut hasher, 0);
+ }
+
+ #[test]
+ #[should_panic]
+ #[cfg(debug_assertions)]
+ fn test_offset_then_update_too_much() {
+ let mut hasher = Hasher::new();
+ set_offset(&mut hasher, 12 * CHUNK_LEN as u64);
+ hasher.update(&[0; 4 * CHUNK_LEN + 1]);
+ }
+
+ #[test]
+ #[should_panic]
+ #[cfg(debug_assertions)]
+ fn test_offset_then_root_finalize_xof() {
+ let mut hasher = Hasher::new();
+ set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+ hasher.update(&[0; 2 * CHUNK_LEN]);
+ hasher.finalize_xof();
+ }
+
+ #[test]
+ #[should_panic]
+ #[cfg(debug_assertions)]
+ fn test_offset_then_root_finalize() {
+ let mut hasher = Hasher::new();
+ set_offset(&mut hasher, 2 * CHUNK_LEN as u64);
+ hasher.update(&[0; 2 * CHUNK_LEN]);
+ hasher.finalize();
+ }
}
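
One more sketch (also not in the commit) of the caveat in the finalize_nonroot comment: a caller with a variable number of subtrees must root-finalize when there turns out to be only one. The finalize_subtree helper is hypothetical.

    use blake3::guts::finalize_nonroot;
    use blake3::Hasher;

    // `is_only_subtree` is true when the entire input fit in a single subtree
    // at offset zero; that subtree is the root and takes the ordinary path.
    fn finalize_subtree(hasher: &Hasher, is_only_subtree: bool) -> blake3::Hash {
        if is_only_subtree {
            hasher.finalize()
        } else {
            finalize_nonroot(hasher)
        }
    }
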
diff --git a/src/lib.rs b/src/lib.rs
index 9fc71ae..c3d5b0f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -939,6 +939,8 @@ fn parent_node_output(
pub struct Hasher {
key: CVWords,
chunk_state: ChunkState,
+ // Always zero, unless you use guts::set_offset to change it.
+ initial_chunk_counter: u64,
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because
@@ -952,6 +954,7 @@ impl Hasher {
Self {
key: *key,
chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+ initial_chunk_counter: 0,
cv_stack: ArrayVec::new(),
}
}
@@ -998,6 +1001,10 @@ impl Hasher {
self
}
+ fn chunks_in_cv_stack(&self) -> u64 {
+ self.chunk_state.chunk_counter - self.initial_chunk_counter
+ }
+
// As described in push_cv() below, we do "lazy merging", delaying merges
// until right before the next CV is about to be added. This is different
// from the reference implementation. Another difference is that we aren't
@@ -1009,8 +1016,8 @@ impl Hasher {
// the CV on top of the stack. The principle is the same: each CV that
// should remain in the stack is represented by a 1-bit in the total number
// of chunks (or bytes) so far.
- fn merge_cv_stack(&mut self, total_len: u64) {
- let post_merge_stack_len = total_len.count_ones() as usize;
+ fn merge_cv_stack(&mut self) {
+ let post_merge_stack_len = self.chunks_in_cv_stack().count_ones() as usize;
while self.cv_stack.len() > post_merge_stack_len {
let right_child = self.cv_stack.pop().unwrap();
let left_child = self.cv_stack.pop().unwrap();
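
The count_ones rule is easiest to see with small numbers. A quick illustration (not part of the diff):

    #[test]
    fn stack_len_matches_count_ones() {
        // After n chunks, the lazily-merged stack holds one CV per 1-bit of n.
        assert_eq!(7u64.count_ones(), 3);  // 0b111:  subtrees of 4, 2, and 1 chunks
        assert_eq!(8u64.count_ones(), 1);  // 0b1000: merged into one 8-chunk subtree
        assert_eq!(12u64.count_ones(), 2); // 0b1100: subtrees of 8 and 4 chunks
    }
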
@@ -1058,9 +1065,10 @@ impl Hasher {
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when
// we're hashing an input all-at-once.)
- fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
- self.merge_cv_stack(chunk_counter);
+ fn push_cv(&mut self, new_cv: &CVBytes, num_chunks: u64) {
+ self.merge_cv_stack();
self.cv_stack.push(*new_cv);
+ self.chunk_state.chunk_counter += num_chunks;
}
/// Add input bytes to the hash state. You can call this any number of
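
With this change, push_cv is self-contained: callers pass the width of the subtree being pushed, and push_cv advances the chunk counter itself. A trace of a split 4-chunk subtree arriving with an empty stack at counter 0 (illustration only):

    // push_cv(left_cv, 2)   merge at counter 0, push -> stack [L],    counter 2
    // push_cv(right_cv, 2)  merge at counter 2, push -> stack [L, R], counter 4
    // The right CV stays unmerged until the next push, so the root is never
    // merged prematurely.
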
@@ -1106,6 +1114,20 @@ impl Hasher {
}
fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
+ // In debug mode, check that this update doesn't exceed the maximum size of the subtree.
+ // This is only relevant when the caller uses guts::set_offset to set a non-zero starting
+ // offset.
+ if self.initial_chunk_counter != 0 {
+ // For example, if the starting chunk counter is 13 (0b1101), then this subtree can
+ // only be a single chunk. But if it's 12 (0b1100), it can be up to four chunks. Note
+            // that this shift would overflow and panic if initial_chunk_counter were zero, and we
+ // rely on the if-condition above to prevent that.
+ let max_subtree_chunks = 1 << self.initial_chunk_counter.trailing_zeros();
+ let max_subtree_len = CHUNK_LEN as u64 * max_subtree_chunks;
+ let new_subtree_len = self.count() + input.len() as u64;
+ debug_assert!(new_subtree_len <= max_subtree_len);
+ }
+
// If we have some partial chunk bytes in the internal chunk_state, we
// need to finish that chunk first.
if self.chunk_state.len() > 0 {
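
The comment's arithmetic, checked numerically (an illustration, not part of the diff):

    #[test]
    fn max_subtree_chunks_examples() {
        // A subtree starting at chunk index n can span at most
        // 1 << n.trailing_zeros() chunks.
        assert_eq!(1u64 << 13u64.trailing_zeros(), 1); // 13 = 0b1101: one chunk only
        assert_eq!(1u64 << 12u64.trailing_zeros(), 4); // 12 = 0b1100: up to four chunks
        assert_eq!(1u64 << 8u64.trailing_zeros(), 8);  //  8 = 0b1000: up to eight chunks
    }
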
@@ -1119,10 +1141,10 @@ impl Hasher {
// Then we'll proceed to hashing whole chunks below.
debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
let chunk_cv = self.chunk_state.output().chaining_value();
- self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+ self.push_cv(&chunk_cv, 1);
self.chunk_state = ChunkState::new(
&self.key,
- self.chunk_state.chunk_counter + 1,
+ self.chunk_state.chunk_counter, // incremented in push_cv
self.chunk_state.flags,
self.chunk_state.platform,
);
@@ -1185,7 +1207,7 @@ impl Hasher {
.update(&input[..subtree_len])
.output()
.chaining_value(),
- self.chunk_state.chunk_counter,
+ subtree_chunks,
);
} else {
// This is the high-performance happy path, though getting here
@@ -1202,13 +1224,9 @@ impl Hasher {
// Push the two CVs we received into the CV stack in order. Because
// the stack merges lazily, this guarantees we aren't merging the
// root.
- self.push_cv(left_cv, self.chunk_state.chunk_counter);
- self.push_cv(
- right_cv,
- self.chunk_state.chunk_counter + (subtree_chunks / 2),
- );
+ self.push_cv(left_cv, subtree_chunks / 2);
+ self.push_cv(right_cv, subtree_chunks / 2);
}
- self.chunk_state.chunk_counter += subtree_chunks;
input = &input[subtree_len..];
}
@@ -1219,7 +1237,7 @@ impl Hasher {
// Having added some input to the chunk_state, we know what's in
// the CV stack won't become the root node, and we can do an extra
// merge. This simplifies finalize().
- self.merge_cv_stack(self.chunk_state.chunk_counter);
+ self.merge_cv_stack();
}
self
@@ -1230,7 +1248,7 @@ impl Hasher {
// also. Convert it directly into an Output. Otherwise, we need to
// merge subtrees below.
if self.cv_stack.is_empty() {
- debug_assert_eq!(self.chunk_state.chunk_counter, 0);
+ debug_assert_eq!(self.chunks_in_cv_stack(), 0);
return self.chunk_state.output();
}
@@ -1251,8 +1269,8 @@ impl Hasher {
if self.chunk_state.len() > 0 {
debug_assert_eq!(
self.cv_stack.len(),
- self.chunk_state.chunk_counter.count_ones() as usize,
- "cv stack does not need a merge"
+ self.chunks_in_cv_stack().count_ones() as usize,
+ "cv stack should not need a merge"
);
output = self.chunk_state.output();
} else {
@@ -1285,6 +1303,10 @@ impl Hasher {
/// This method is idempotent. Calling it twice will give the same result.
/// You can also add more input and finalize again.
pub fn finalize(&self) -> Hash {
+ debug_assert_eq!(
+ 0, self.initial_chunk_counter,
+ "the root subtree must start at offset 0",
+ );
self.final_output().root_hash()
}
@@ -1296,12 +1318,16 @@ impl Hasher {
///
/// [`OutputReader`]: struct.OutputReader.html
pub fn finalize_xof(&self) -> OutputReader {
+ debug_assert_eq!(
+ 0, self.initial_chunk_counter,
+ "the root subtree must start at offset 0",
+ );
OutputReader::new(self.final_output())
}
/// Return the total number of bytes hashed so far.
pub fn count(&self) -> u64 {
- self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
+ self.chunks_in_cv_stack() * CHUNK_LEN as u64 + self.chunk_state.len() as u64
}
}
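
Finally, a sketch (not in the commit) of what count() reports once an offset is set: bytes fed to this Hasher, not the absolute stream position.

    use blake3::guts::{set_offset, CHUNK_LEN};
    use blake3::Hasher;

    fn main() {
        let mut hasher = Hasher::new();
        set_offset(&mut hasher, 4 * CHUNK_LEN as u64);
        hasher.update(&[0; CHUNK_LEN]);
        // One chunk of input: count() is CHUNK_LEN, not 5 * CHUNK_LEN.
        assert_eq!(hasher.count(), CHUNK_LEN as u64);
    }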