Diffstat (limited to 'src/lib.rs')
-rw-r--r--  src/lib.rs  116
1 file changed, 72 insertions, 44 deletions
@@ -92,13 +92,12 @@
 #[cfg(test)]
 mod test;
 
-// The guts module is for incremental use cases like the `bao` crate that need
-// to explicitly compute chunk and parent chaining values. It is semi-stable
-// and likely to keep working, but largely undocumented and not intended for
-// widespread use.
 #[doc(hidden)]
+#[deprecated(since = "1.8.0", note = "use the hazmat module instead")]
 pub mod guts;
 
+pub mod hazmat;
+
 /// Undocumented and unstable, for benchmarks only.
 #[doc(hidden)]
 pub mod platform;
@@ -155,8 +154,20 @@ pub const OUT_LEN: usize = 32;
 /// The number of bytes in a key, 32.
 pub const KEY_LEN: usize = 32;
 
+/// The number of bytes in a block, 64.
+///
+/// You don't usually need to think about this number. One case where it matters is calling
+/// [`OutputReader::fill`] in a loop, where using a `buf` argument that's a multiple of `BLOCK_LEN`
+/// avoids repeating work.
+pub const BLOCK_LEN: usize = 64;
+
+/// The number of bytes in a chunk, 1024.
+///
+/// You don't usually need to think about this number, but it often comes up in benchmarks, because
+/// the maximum degree of parallelism used by the implementation equals the number of chunks.
+pub const CHUNK_LEN: usize = 1024;
+
 const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
-use guts::{BLOCK_LEN, CHUNK_LEN};
 
 // While iterating the compression function within a chunk, the CV is
 // represented as words, to avoid doing two extra endianness conversions for
@@ -227,7 +238,7 @@ fn counter_high(counter: u64) -> u32 {
 /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html
 /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html
 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
-#[derive(Clone, Copy, Hash)]
+#[derive(Clone, Copy, Hash, Eq)]
 pub struct Hash([u8; OUT_LEN]);
 
 impl Hash {
@@ -354,8 +365,6 @@ impl PartialEq<[u8]> for Hash {
     }
 }
 
-impl Eq for Hash {}
-
 impl fmt::Display for Hash {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         // Formatting field as `&str` to reduce code size since the `Debug`
@@ -504,7 +513,7 @@ impl ChunkState {
         }
     }
 
-    fn len(&self) -> usize {
+    fn count(&self) -> usize {
         BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
     }
 
@@ -561,7 +570,7 @@
 
         self.fill_buf(&mut input);
         debug_assert!(input.is_empty());
-        debug_assert!(self.len() <= CHUNK_LEN);
+        debug_assert!(self.count() <= CHUNK_LEN);
         self
     }
 
@@ -582,7 +591,7 @@
 impl fmt::Debug for ChunkState {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         f.debug_struct("ChunkState")
-            .field("len", &self.len())
+            .field("count", &self.count())
             .field("chunk_counter", &self.chunk_counter)
             .field("flags", &self.flags)
             .field("platform", &self.platform)
@@ -646,22 +655,12 @@ impl IncrementCounter {
     }
 }
 
-// The largest power of two less than or equal to `n`, used for left_len()
-// immediately below, and also directly in Hasher::update().
+// The largest power of two less than or equal to `n`, used in Hasher::update(). This is similar to
+// left_subtree_len(n), but note that left_subtree_len(n) is strictly less than `n`.
 fn largest_power_of_two_leq(n: usize) -> usize {
     ((n / 2) + 1).next_power_of_two()
 }
 
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-fn left_len(content_len: usize) -> usize {
-    debug_assert!(content_len > CHUNK_LEN);
-    // Subtract 1 to reserve at least one byte for the right side.
-    let full_chunks = (content_len - 1) / CHUNK_LEN;
-    largest_power_of_two_leq(full_chunks) * CHUNK_LEN
-}
-
 // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
 // on a single thread. Write out the chunk chaining values and return the
 // number of chunks hashed. These chunks are never the root and never empty;
@@ -790,7 +789,7 @@ fn compress_subtree_wide<J: join::Join>(
     // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
     // of 3 or something, we'll need a more complicated strategy.)
     debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
-    let (left, right) = input.split_at(left_len(input.len()));
+    let (left, right) = input.split_at(hazmat::left_subtree_len(input.len() as u64) as usize);
     let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
 
     // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
@@ -996,10 +995,8 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
 ///
 /// [Argon2]: https://en.wikipedia.org/wiki/Argon2
 pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
-    let context_key =
-        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
-            .root_hash();
-    let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
+    let context_key = hazmat::hash_derive_key_context(context);
+    let context_key_words = platform::words_from_le_bytes_32(&context_key);
     hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
         .root_hash()
         .0
@@ -1063,6 +1060,7 @@ fn parent_node_output(
 pub struct Hasher {
     key: CVWords,
     chunk_state: ChunkState,
+    initial_chunk_counter: u64,
     // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
     // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
     // requires a 4th entry, rather than merging everything down to 1, because
@@ -1076,6 +1074,7 @@ impl Hasher {
         Self {
             key: *key,
             chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+            initial_chunk_counter: 0,
             cv_stack: ArrayVec::new(),
         }
     }
@@ -1100,10 +1099,8 @@ impl Hasher {
     ///
     /// [`derive_key`]: fn.derive_key.html
     pub fn new_derive_key(context: &str) -> Self {
-        let context_key =
-            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
-                .root_hash();
-        let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
+        let context_key = hazmat::hash_derive_key_context(context);
+        let context_key_words = platform::words_from_le_bytes_32(&context_key);
         Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
     }
 
@@ -1133,8 +1130,12 @@ impl Hasher {
     // the CV on top of the stack. The principle is the same: each CV that
     // should remain in the stack is represented by a 1-bit in the total number
     // of chunks (or bytes) so far.
-    fn merge_cv_stack(&mut self, total_len: u64) {
-        let post_merge_stack_len = total_len.count_ones() as usize;
+    fn merge_cv_stack(&mut self, chunk_counter: u64) {
+        // Account for non-zero cases of Hasher::set_input_offset, where there are no prior
+        // subtrees in the stack. Note that initial_chunk_counter is always 0 for callers who don't
+        // use the hazmat module.
+        let post_merge_stack_len =
+            (chunk_counter - self.initial_chunk_counter).count_ones() as usize;
         while self.cv_stack.len() > post_merge_stack_len {
            let right_child = self.cv_stack.pop().unwrap();
            let left_child = self.cv_stack.pop().unwrap();
@@ -1199,10 +1200,21 @@
     }
 
     fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
+        let input_offset = self.initial_chunk_counter * CHUNK_LEN as u64;
+        if let Some(max) = hazmat::max_subtree_len(input_offset) {
+            let remaining = max - self.count();
+            assert!(
+                input.len() as u64 <= remaining,
+                "the subtree starting at {} contains at most {} bytes (found {})",
+                CHUNK_LEN as u64 * self.initial_chunk_counter,
+                max,
+                input.len(),
+            );
+        }
         // If we have some partial chunk bytes in the internal chunk_state, we
         // need to finish that chunk first.
-        if self.chunk_state.len() > 0 {
-            let want = CHUNK_LEN - self.chunk_state.len();
+        if self.chunk_state.count() > 0 {
+            let want = CHUNK_LEN - self.chunk_state.count();
             let take = cmp::min(want, input.len());
             self.chunk_state.update(&input[..take]);
             input = &input[take..];
@@ -1210,7 +1222,7 @@
                 // We've filled the current chunk, and there's more input
                 // coming, so we know it's not the root and we can finalize it.
                 // Then we'll proceed to hashing whole chunks below.
-                debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
+                debug_assert_eq!(self.chunk_state.count(), CHUNK_LEN);
                 let chunk_cv = self.chunk_state.output().chaining_value();
                 self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
                 self.chunk_state = ChunkState::new(
@@ -1238,7 +1250,7 @@
         // Because we might need to break up the input to form powers of 2, or
        // to evenly divide what we already have, this part runs in a loop.
         while input.len() > CHUNK_LEN {
-            debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
+            debug_assert_eq!(self.chunk_state.count(), 0, "no partial chunk data");
             debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
             let mut subtree_len = largest_power_of_two_leq(input.len());
             let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
@@ -1323,7 +1335,7 @@
         // also. Convert it directly into an Output. Otherwise, we need to
        // merge subtrees below.
         if self.cv_stack.is_empty() {
-            debug_assert_eq!(self.chunk_state.chunk_counter, 0);
+            debug_assert_eq!(self.chunk_state.chunk_counter, self.initial_chunk_counter);
             return self.chunk_state.output();
         }
 
@@ -1341,11 +1353,11 @@
         // the empty chunk is taken care of above.
         let mut output: Output;
         let mut num_cvs_remaining = self.cv_stack.len();
-        if self.chunk_state.len() > 0 {
+        if self.chunk_state.count() > 0 {
             debug_assert_eq!(
                 self.cv_stack.len(),
-                self.chunk_state.chunk_counter.count_ones() as usize,
-                "cv stack does not need a merge"
+                (self.chunk_state.chunk_counter - self.initial_chunk_counter).count_ones() as usize,
+                "cv stack does not need a merge",
             );
             output = self.chunk_state.output();
         } else {
@@ -1378,6 +1390,10 @@
     /// This method is idempotent. Calling it twice will give the same result.
     /// You can also add more input and finalize again.
     pub fn finalize(&self) -> Hash {
+        assert_eq!(
+            self.initial_chunk_counter, 0,
+            "set_input_offset must be used with finalize_non_root",
+        );
         self.final_output().root_hash()
     }
 
@@ -1389,12 +1405,22 @@
     ///
     /// [`OutputReader`]: struct.OutputReader.html
     pub fn finalize_xof(&self) -> OutputReader {
+        assert_eq!(
+            self.initial_chunk_counter, 0,
+            "set_input_offset must be used with finalize_non_root",
+        );
         OutputReader::new(self.final_output())
     }
 
     /// Return the total number of bytes hashed so far.
+    ///
+    /// [`hazmat::HasherExt::set_input_offset`] does not affect this value. This only counts bytes
+    /// passed to [`update`](Hasher::update).
     pub fn count(&self) -> u64 {
-        self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
+        // Account for non-zero cases of Hasher::set_input_offset. Note that initial_chunk_counter
+        // is always 0 for callers who don't use the hazmat module.
+        (self.chunk_state.chunk_counter - self.initial_chunk_counter) * CHUNK_LEN as u64
+            + self.chunk_state.count() as u64
     }
 
     /// As [`update`](Hasher::update), but reading from a
@@ -1613,11 +1639,13 @@ impl Zeroize for Hasher {
         let Self {
             key,
             chunk_state,
+            initial_chunk_counter,
             cv_stack,
         } = self;
 
         key.zeroize();
         chunk_state.zeroize();
+        initial_chunk_counter.zeroize();
         cv_stack.zeroize();
     }
 }
@@ -1684,7 +1712,7 @@
     /// calling `fill` repeatedly with a short-length or odd-length slice will
     /// end up performing the same compression multiple times. If you're
     /// reading output in a loop, prefer a slice length that's a multiple of
-    /// 64.
+    /// [`BLOCK_LEN`] (64 bytes).
     ///
     /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
     /// to extract more than that, for example by seeking near the end and
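The `OutputReader::fill` advice in the new `BLOCK_LEN` docs is easy to act on once the constant is public. A minimal sketch, assuming blake3 1.8.0+ where this diff's `pub const BLOCK_LEN` is available at the crate root:

```rust
fn main() {
    // Read XOF output in multiples of BLOCK_LEN, so fill() never has to
    // recompute a partially consumed block across calls.
    let mut reader = blake3::Hasher::new().update(b"example input").finalize_xof();
    let mut buf = [0u8; 16 * blake3::BLOCK_LEN]; // 1024 bytes per call
    for _ in 0..1024 {
        reader.fill(&mut buf);
        // ... consume &buf[..] ...
    }
}
```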
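The updated comment on `largest_power_of_two_leq` points at `hazmat::left_subtree_len`, which subsumes the deleted `left_len` helper: for input longer than one chunk, the left subtree takes the largest power-of-two number of whole chunks that still leaves at least one byte for the right subtree. A few worked values (the `u64` in/out signature is taken from the `compress_subtree_wide` call site above):

```rust
use blake3::hazmat::left_subtree_len;

fn main() {
    // 4096 bytes = 4 chunks: 2 chunks (2048 bytes) go left.
    assert_eq!(left_subtree_len(4096), 2048);
    // Exactly 2 chunks: the result is strictly less than the input,
    // so only 1 chunk goes left and the second chunk goes right.
    assert_eq!(left_subtree_len(2048), 1024);
    // One byte past 4 chunks: all 4 whole chunks go left.
    assert_eq!(left_subtree_len(4097), 4096);
}
```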
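The `merge_cv_stack` rewrite keeps the invariant the old code had, just rebased on `initial_chunk_counter`: each chaining value left on the stack corresponds to a 1-bit in the number of chunks hashed so far. A tiny illustration of that popcount rule:

```rust
fn main() {
    // After 7 chunks (0b111), three subtree CVs remain on the stack,
    // covering 4, 2, and 1 chunks respectively.
    assert_eq!(7u64.count_ones(), 3);
    // A counter of 8 (0b1000) corresponds to a single merged subtree.
    assert_eq!(8u64.count_ones(), 1);
}
```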
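Finally, the new assertions in `update_with_join`, `finalize`, and `finalize_xof` enforce the `hazmat` contract: a `Hasher` with a non-zero input offset may only hash bytes that fit inside the subtree at that offset, and may only be finalized as a non-root node. A sketch of the intended split-and-merge workflow; note that `HasherExt`, `Mode`, and `merge_subtrees_root` don't appear in this diff and are assumed from the 1.8.0 `hazmat` API (only `set_input_offset` and `finalize_non_root` are named here, in the assertion messages):

```rust
use blake3::hazmat::{merge_subtrees_root, HasherExt, Mode};
use blake3::{Hasher, CHUNK_LEN};

fn main() {
    let input = [0xab_u8; 2 * CHUNK_LEN]; // two whole chunks

    // Hash the left subtree (offset 0) down to a non-root chaining value.
    let mut left = Hasher::new();
    left.update(&input[..CHUNK_LEN]);
    let left_cv = left.finalize_non_root();

    // The right subtree starts at a chunk-aligned byte offset.
    let mut right = Hasher::new();
    right.set_input_offset(CHUNK_LEN as u64);
    right.update(&input[CHUNK_LEN..]);
    let right_cv = right.finalize_non_root();

    // Merging the two CVs as the root reproduces the regular hash.
    let root = merge_subtrees_root(&left_cv, &right_cv, Mode::Hash);
    assert_eq!(root, blake3::hash(&input));
}
```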
