From e1c2ea27fdd717fd924d7b286a125408d7e817f7 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 30 Mar 2025 17:25:54 -0700 Subject: add the `hazmat` module and deprecate the undocumented `guts` module https://github.com/BLAKE3-team/BLAKE3/pull/458 --- b3sum/src/main.rs | 2 +- benches/bench.rs | 2 +- c/blake3.c | 8 +- src/guts.rs | 47 +--- src/hazmat.rs | 704 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 116 +++++--- src/test.rs | 63 +++-- test_vectors/src/lib.rs | 2 +- 8 files changed, 831 insertions(+), 113 deletions(-) create mode 100644 src/hazmat.rs diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs index 4479d4d..69a10c8 100644 --- a/b3sum/src/main.rs +++ b/b3sum/src/main.rs @@ -196,7 +196,7 @@ fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> anyhow::Re // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64. // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF. let mut len = args.len(); - let mut block = [0; blake3::guts::BLOCK_LEN]; + let mut block = [0; blake3::BLOCK_LEN]; while len > 0 { output.fill(&mut block); let hex_str = hex::encode(&block[..]); diff --git a/benches/bench.rs b/benches/bench.rs index defdb12..e89f04c 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -4,9 +4,9 @@ extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; -use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::OUT_LEN; +use blake3::{BLOCK_LEN, CHUNK_LEN}; use rand::prelude::*; use test::Bencher; diff --git a/c/blake3.c b/c/blake3.c index f21973e..74fc2e7 100644 --- a/c/blake3.c +++ b/c/blake3.c @@ -158,10 +158,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], // Given some input larger than one chunk, return the number of bytes that // should go in the left subtree. This is the largest power-of-2 number of // chunks that leaves at least 1 byte for the right subtree. -INLINE size_t left_len(size_t content_len) { - // Subtract 1 to reserve at least one byte for the right side. content_len +INLINE size_t left_subtree_len(size_t input_len) { + // Subtract 1 to reserve at least one byte for the right side. input_len // should always be greater than BLAKE3_CHUNK_LEN. - size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN; return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; } @@ -282,7 +282,7 @@ size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) - size_t left_input_len = left_len(input_len); + size_t left_input_len = left_subtree_len(input_len); size_t right_input_len = input_len - left_input_len; const uint8_t *right_input = &input[left_input_len]; uint64_t right_chunk_counter = diff --git a/src/guts.rs b/src/guts.rs index ecde326..6bbf5a5 100644 --- a/src/guts.rs +++ b/src/guts.rs @@ -1,13 +1,6 @@ -//! This undocumented and unstable module is for use cases like the `bao` crate, -//! which need to traverse the BLAKE3 Merkle tree and work with chunk and parent -//! chaining values directly. There might be breaking changes to this module -//! between patch versions. -//! -//! We could stabilize something like this module in the future. If you have a -//! 
use case for it, please let us know by filing a GitHub issue. +//! Deprecated in favor of [`hazmat`](crate::hazmat) -pub const BLOCK_LEN: usize = 64; -pub const CHUNK_LEN: usize = 1024; +pub use crate::{BLOCK_LEN, CHUNK_LEN}; #[derive(Clone, Debug)] pub struct ChunkState(crate::ChunkState); @@ -26,7 +19,7 @@ impl ChunkState { #[inline] pub fn len(&self) -> usize { - self.0.len() + self.0.count() } #[inline] @@ -65,37 +58,3 @@ pub fn parent_cv( output.chaining_value().into() } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_chunk() { - assert_eq!( - crate::hash(b"foo"), - ChunkState::new(0).update(b"foo").finalize(true) - ); - } - - #[test] - fn test_parents() { - let mut hasher = crate::Hasher::new(); - let mut buf = [0; crate::CHUNK_LEN]; - - buf[0] = 'a' as u8; - hasher.update(&buf); - let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); - - buf[0] = 'b' as u8; - hasher.update(&buf); - let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); - - hasher.update(b"c"); - let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); - - let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); - let root = parent_cv(&parent, &chunk2_cv, true); - assert_eq!(hasher.finalize(), root); - } -} diff --git a/src/hazmat.rs b/src/hazmat.rs new file mode 100644 index 0000000..2fd2449 --- /dev/null +++ b/src/hazmat.rs @@ -0,0 +1,704 @@ +//! Low-level tree manipulations and other sharp tools +//! +//! The target audience for this module is projects like [Bao](https://github.com/oconnor663/bao), +//! which work directly with the interior hashes ("chaining values") of BLAKE3 chunks and subtrees. +//! For example, you could use these functions to implement a BitTorrent-like protocol using the +//! BLAKE3 tree structure, or to hash an input that's distributed across different machines. These +//! use cases are advanced, and most applications don't need this module. Also: +//! +//!
+//! +//! **Warning:** This module is *hazardous material*. If you've heard folks say *don't roll your +//! own crypto,* this is the sort of thing they're talking about. These functions have complicated +//! requirements, and any mistakes will give you garbage output and/or break the security +//! properties that BLAKE3 is supposed to have. Read section 2.1 of [the BLAKE3 +//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) to understand the +//! tree structure you need to maintain. Test your code against [`blake3::hash`](../fn.hash.html) +//! and make sure you can get the same outputs for [lots of different +//! inputs](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json). +//! +//!
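The cross-checking advice above is worth making concrete. The sketch below is editorial and not part of the patch; it uses the hazmat API introduced later in this file to rebuild the root hash by recursive splitting and compares the result against `blake3::hash` for a few input lengths, so any tree-structure mistake shows up as a mismatch. All names besides the blake3 API itself (`subtree_cv`, `tree_hash`) are illustrative.

```rust
use blake3::hazmat::{
    left_subtree_len, merge_subtrees_non_root, merge_subtrees_root, ChainingValue, HasherExt, Mode,
};
use blake3::{Hasher, CHUNK_LEN};

// Non-root chaining value of the subtree that starts `offset` bytes into the whole input.
fn subtree_cv(input: &[u8], offset: u64) -> ChainingValue {
    if input.len() <= CHUNK_LEN {
        // A single (possibly short) chunk.
        let mut hasher = Hasher::new();
        hasher.set_input_offset(offset);
        hasher.update(input);
        return hasher.finalize_non_root();
    }
    // Split exactly where left_subtree_len says to, then merge the two child chaining values.
    let left_len = left_subtree_len(input.len() as u64) as usize;
    let left = subtree_cv(&input[..left_len], offset);
    let right = subtree_cv(&input[left_len..], offset + left_len as u64);
    merge_subtrees_non_root(&left, &right, Mode::Hash)
}

// Root hash via the hazmat API, for inputs longer than one chunk.
fn tree_hash(input: &[u8]) -> blake3::Hash {
    assert!(input.len() > CHUNK_LEN, "one-chunk inputs are hashed with Hasher::finalize");
    let left_len = left_subtree_len(input.len() as u64) as usize;
    let left = subtree_cv(&input[..left_len], 0);
    let right = subtree_cv(&input[left_len..], left_len as u64);
    merge_subtrees_root(&left, &right, Mode::Hash)
}

fn main() {
    for len in [CHUNK_LEN + 1, 3 * CHUNK_LEN, 4 * CHUNK_LEN + 7, 100_000] {
        let input = vec![0xab; len];
        assert_eq!(tree_hash(&input), blake3::hash(&input));
    }
}
```

Splitting recursively with `left_subtree_len` keeps every subtree within the limits described by `max_subtree_len` below, which is why no extra bounds checking is needed in this sketch.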
+//! +//! On the other hand: +//! +//!
+//! +//! **Encouragement:** Playing with these functions is a great way to learn how BLAKE3 works on the +//! inside. Have fun! +//! +//!
+//! +//! The main entrypoint for this module is the [`HasherExt`] trait, particularly the +//! [`set_input_offset`](HasherExt::set_input_offset) and +//! [`finalize_non_root`](HasherExt::finalize_non_root) methods. These let you compute the chaining +//! values of individual chunks or subtrees. You then combine these chaining values into larger +//! subtrees using [`merge_subtrees_non_root`] and finally (once at the very top) +//! [`merge_subtrees_root`] or [`merge_subtrees_root_xof`]. +//! +//! # Examples +//! +//! Here's an example of computing all the interior hashes in a 3-chunk tree: +//! +//! ```text +//! root +//! / \ +//! parent \ +//! / \ \ +//! chunk0 chunk1 chunk2 +//! ``` +//! +//! ``` +//! # fn main() { +//! use blake3::{Hasher, CHUNK_LEN}; +//! use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, Mode}; +//! use blake3::hazmat::HasherExt; // an extension trait for Hasher +//! +//! let chunk0 = [b'a'; CHUNK_LEN]; +//! let chunk1 = [b'b'; CHUNK_LEN]; +//! let chunk2 = [b'c'; 42]; // The final chunk can be short. +//! +//! // Compute the non-root hashes ("chaining values") of all three chunks. Chunks or subtrees +//! // that don't begin at the start of the input use `set_input_offset` to say where they begin. +//! let chunk0_cv = Hasher::new() +//! // .set_input_offset(0) is the default. +//! .update(&chunk0) +//! .finalize_non_root(); +//! let chunk1_cv = Hasher::new() +//! .set_input_offset(CHUNK_LEN as u64) +//! .update(&chunk1) +//! .finalize_non_root(); +//! let chunk2_cv = Hasher::new() +//! .set_input_offset(2 * CHUNK_LEN as u64) +//! .update(&chunk2) +//! .finalize_non_root(); +//! +//! // Join the first two chunks with a non-root parent node and compute its chaining value. +//! let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); +//! +//! // Join that parent node and the third chunk with a root parent node and compute the hash. +//! let root_hash = merge_subtrees_root(&parent_cv, &chunk2_cv, Mode::Hash); +//! +//! // Double check that we got the right answer. +//! let mut combined_input = Vec::new(); +//! combined_input.extend_from_slice(&chunk0); +//! combined_input.extend_from_slice(&chunk1); +//! combined_input.extend_from_slice(&chunk2); +//! assert_eq!(root_hash, blake3::hash(&combined_input)); +//! # } +//! ``` +//! +//! Hashing many chunks together is important for performance, because it allows the implementation +//! to use SIMD parallelism internally. ([AVX-512](https://en.wikipedia.org/wiki/AVX-512) for +//! example needs 16 chunks to really get going.) We can reproduce `parent_cv` by hashing `chunk0` +//! and `chunk1` at the same time: +//! +//! ``` +//! # fn main() { +//! # use blake3::{Hasher, CHUNK_LEN}; +//! # use blake3::hazmat::{Mode, HasherExt, merge_subtrees_non_root, merge_subtrees_root}; +//! # let chunk0 = [b'a'; CHUNK_LEN]; +//! # let chunk1 = [b'b'; CHUNK_LEN]; +//! # let chunk0_cv = Hasher::new().update(&chunk0).finalize_non_root(); +//! # let chunk1_cv = Hasher::new().set_input_offset(CHUNK_LEN as u64).update(&chunk1).finalize_non_root(); +//! # let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); +//! # let mut combined_input = Vec::new(); +//! # combined_input.extend_from_slice(&chunk0); +//! # combined_input.extend_from_slice(&chunk1); +//! let left_subtree_cv = Hasher::new() +//! // .set_input_offset(0) is the default. +//! .update(&combined_input[..2 * CHUNK_LEN]) +//! .finalize_non_root(); +//! assert_eq!(left_subtree_cv, parent_cv); +//! +//! 
// Using multiple updates gives the same answer, though it's not as efficient.
+//! let mut subtree_hasher = Hasher::new();
+//! // Again, .set_input_offset(0) is the default.
+//! subtree_hasher.update(&chunk0);
+//! subtree_hasher.update(&chunk1);
+//! assert_eq!(left_subtree_cv, subtree_hasher.finalize_non_root());
+//! # }
+//! ```
+//!
+//! However, hashing multiple chunks together **must** respect the overall tree structure. Hashing
+//! `chunk0` and `chunk1` together is valid, but hashing `chunk1` and `chunk2` together is
+//! incorrect and gives a garbage result that will never match a standard BLAKE3 hash. The
+//! implementation includes a few best-effort asserts to catch some of these mistakes, but these
+//! checks aren't guaranteed. For example, this second call to `update` currently panics:
+//!
+//! ```should_panic
+//! # fn main() {
+//! # use blake3::{Hasher, CHUNK_LEN};
+//! # use blake3::hazmat::HasherExt;
+//! # let chunk0 = [b'a'; CHUNK_LEN];
+//! # let chunk1 = [b'b'; CHUNK_LEN];
+//! # let chunk2 = [b'c'; 42];
+//! let oops = Hasher::new()
+//!     .set_input_offset(CHUNK_LEN as u64)
+//!     .update(&chunk1)
+//!     // PANIC: "the subtree starting at 1024 contains at most 1024 bytes"
+//!     .update(&chunk2)
+//!     .finalize_non_root();
+//! # }
+//! ```
+//!
+//! For more on valid tree structures, see the docs for [`left_subtree_len`] and
+//! [`max_subtree_len`], and see section 2.1 of [the BLAKE3
+//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that the
+//! merging functions ([`merge_subtrees_root`] and friends) don't know the shape of the left and
+//! right subtrees you're giving them, and they can't help you catch mistakes. The best way to
+//! catch mistakes with these is to compare your root output to the [`blake3::hash`](crate::hash)
+//! of the same input.
+
+use crate::platform::Platform;
+use crate::{CVWords, Hasher, CHUNK_LEN, IV, KEY_LEN, OUT_LEN};
+
+/// Extension methods for [`Hasher`]. This is the main entrypoint to the `hazmat` module.
+pub trait HasherExt {
+    /// Similar to [`Hasher::new_derive_key`] but using a pre-hashed [`ContextKey`] from
+    /// [`hash_derive_key_context`].
+    ///
+    /// The [`hash_derive_key_context`] function is the _only_ valid source of the [`ContextKey`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use blake3::Hasher;
+    /// use blake3::hazmat::HasherExt;
+    ///
+    /// let context_key = blake3::hazmat::hash_derive_key_context("foo");
+    /// let mut hasher = Hasher::new_from_context_key(&context_key);
+    /// hasher.update(b"bar");
+    /// let derived_key = *hasher.finalize().as_bytes();
+    ///
+    /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar"));
+    /// ```
+    fn new_from_context_key(context_key: &ContextKey) -> Self;
+
+    /// Configure the `Hasher` to process a chunk or subtree starting at `offset` bytes into the
+    /// whole input.
+    ///
+    /// You must call this function before processing any input with [`update`](Hasher::update) or
+    /// similar. This step isn't required for the first chunk, or for a subtree that includes the
+    /// first chunk (i.e. when the `offset` is zero), but it's required for all other chunks and
+    /// subtrees.
+    ///
+    /// The starting input offset of a subtree implies a maximum possible length for that subtree.
+    /// See [`max_subtree_len`] and section 2.1 of [the BLAKE3
+    /// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
Note that only + /// subtrees along the right edge of the whole tree can have a length less than their maximum + /// possible length. + /// + /// See the [module level examples](index.html#examples). + /// + /// # Panics + /// + /// This function panics if the `Hasher` has already accepted any input with + /// [`update`](Hasher::update) or similar. + /// + /// This should always be paired with [`finalize_non_root`](HasherExt::finalize_non_root). It's + /// never correct to use a non-zero input offset with [`finalize`](Hasher::finalize) or + /// [`finalize_xof`](Hasher::finalize_xof). The `offset` must also be a multiple of + /// `CHUNK_LEN`. Violating either of these rules will currently fail an assertion and panic, + /// but this is not guaranteed. + fn set_input_offset(&mut self, offset: u64) -> &mut Self; + + /// Finalize the non-root hash ("chaining value") of the current chunk or subtree. + /// + /// Afterwards you can merge subtree chaining values into parent nodes using + /// [`merge_subtrees_non_root`] and ultimately into the root node with either + /// [`merge_subtrees_root`] (similar to [`Hasher::finalize`]) or [`merge_subtrees_root_xof`] + /// (similar to [`Hasher::finalize_xof`]). + /// + /// See the [module level examples](index.html#examples), particularly the discussion of valid + /// tree structures. + fn finalize_non_root(&self) -> ChainingValue; +} + +impl HasherExt for Hasher { + fn new_from_context_key(context_key: &[u8; KEY_LEN]) -> Hasher { + let context_key_words = crate::platform::words_from_le_bytes_32(context_key); + Hasher::new_internal(&context_key_words, crate::DERIVE_KEY_MATERIAL) + } + + fn set_input_offset(&mut self, offset: u64) -> &mut Hasher { + assert_eq!(self.count(), 0, "hasher has already accepted input"); + assert_eq!( + offset % CHUNK_LEN as u64, + 0, + "offset ({offset}) must be a chunk boundary (divisible by {CHUNK_LEN})", + ); + let counter = offset / CHUNK_LEN as u64; + self.chunk_state.chunk_counter = counter; + self.initial_chunk_counter = counter; + self + } + + fn finalize_non_root(&self) -> ChainingValue { + assert_ne!(self.count(), 0, "empty subtrees are never valid"); + self.final_output().chaining_value() + } +} + +/// The maximum length of a subtree in bytes, given its starting offset in bytes +/// +/// If you try to hash more than this many bytes as one subtree, you'll end up merging parent nodes +/// that shouldn't be merged, and your output will be garbage. [`Hasher::update`] will currently +/// panic in this case, but this is not guaranteed. +/// +/// For input offset zero (the default), there is no maximum length, and this function returns +/// `None`. For all other offsets it returns `Some`. Note that valid offsets must be a multiple of +/// [`CHUNK_LEN`] (1024); it's not possible to start hashing a chunk in the middle. +/// +/// In the example tree below, chunks are numbered by their _0-based index_. The subtree that +/// _starts_ with chunk 3, i.e. `input_offset = 3 * CHUNK_LEN`, includes only that one chunk, so +/// its max length is `Some(CHUNK_LEN)`. The subtree that starts with chunk 6 includes chunk 7 but +/// not chunk 8, so its max length is `Some(2 * CHUNK_LEN)`. The subtree that starts with chunk 12 +/// includes chunks 13, 14, and 15, but if the tree were bigger it would not include chunk 16, so +/// its max length is `Some(4 * CHUNK_LEN)`. 
One way to think about the rule here is that, if you
+/// go beyond the max subtree length from a given starting offset, you start dealing with subtrees
+/// that include chunks _to the left_ of where you started.
+///
+/// ```text
+///                             root
+///                       /               \
+///               .                               .
+///           /       \                       /       \
+///       .               .               .               .
+///     /   \           /   \           /   \           /   \
+///   .       .       .       .       .       .       .       .
+///  / \     / \     / \     / \     / \     / \     / \     / \
+/// 0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15
+/// ```
+///
+/// The general rule turns out to be that for a subtree starting at a 0-based chunk index N greater
+/// than zero, the maximum number of chunks in that subtree is the largest power-of-two that
+/// divides N, which is given by `1 << N.trailing_zeros()`.
+///
+/// This function can be useful for writing tests or debug assertions, but it's actually rare to
+/// use this for real control flow. Callers who split their input recursively using
+/// [`left_subtree_len`] will automatically satisfy the `max_subtree_len` bound and don't
+/// necessarily need to check. It's also common to choose some fixed power-of-two subtree size, say
+/// 64 chunks, and divide your input up into slices of that fixed length (with the final slice
+/// possibly short). This approach also automatically satisfies the `max_subtree_len` bound and
+/// doesn't need to check. Proving that this is true can be an interesting exercise. Note that
+/// chunks 0, 4, 8, and 12 all begin subtrees of at least 4 chunks in the example tree above.
+///
+/// # Panics
+///
+/// This function currently panics if `input_offset` is not a multiple of `CHUNK_LEN`. This is not
+/// guaranteed.
+#[inline(always)]
+pub fn max_subtree_len(input_offset: u64) -> Option<u64> {
+    if input_offset == 0 {
+        return None;
+    }
+    assert_eq!(input_offset % CHUNK_LEN as u64, 0);
+    let counter = input_offset / CHUNK_LEN as u64;
+    let max_chunks = 1 << counter.trailing_zeros();
+    Some(max_chunks * CHUNK_LEN as u64)
+}
+
+#[test]
+fn test_max_subtree_len() {
+    assert_eq!(max_subtree_len(0), None);
+    // (chunk index, max chunks)
+    let cases = [
+        (1, 1),
+        (2, 2),
+        (3, 1),
+        (4, 4),
+        (5, 1),
+        (6, 2),
+        (7, 1),
+        (8, 8),
+    ];
+    for (chunk_index, max_chunks) in cases {
+        let input_offset = chunk_index * CHUNK_LEN as u64;
+        assert_eq!(
+            max_subtree_len(input_offset),
+            Some(max_chunks * CHUNK_LEN as u64),
+        );
+    }
+}
+
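The `max_subtree_len` docs above note that a fixed power-of-two group size always satisfies this bound. As a quick editorial illustration (not part of the patch), assuming a group size of 64 chunks, every group after the first starts at a chunk index divisible by 64, so the bound always allows at least one full group:

```rust
use blake3::hazmat::max_subtree_len;
use blake3::CHUNK_LEN;

fn main() {
    let group_len = 64 * CHUNK_LEN as u64;
    for group_index in 1..1_000u64 {
        // Group k starts at byte offset k * 64 * CHUNK_LEN; 64 divides k * 64, so the
        // bound is at least 64 chunks. (The group at offset 0 has no bound at all.)
        let offset = group_index * group_len;
        assert!(max_subtree_len(offset).unwrap() >= group_len);
    }
}
```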
+/// Given the length in bytes of either a complete input or a subtree input, return the number of
+/// bytes that belong to its left child subtree. The rest belong to its right child subtree.
+///
+/// Concretely, this function returns the largest power-of-two number of bytes that's strictly less
+/// than `input_len`. This leads to a tree where all left subtrees are "complete" and at least as
+/// large as their sibling right subtrees, as specified in section 2.1 of [the BLAKE3
+/// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). For example, if an
+/// input is exactly two chunks, its left and right subtrees both get one chunk. But if an input is
+/// two chunks plus one more byte, then its left subtree gets two chunks, and its right subtree
+/// only gets one byte.
+///
+/// This function isn't meaningful for one chunk of input, because chunks don't have children. It
+/// currently panics in debug mode if `input_len <= CHUNK_LEN`.
+///
+/// # Example
+///
+/// Hash an input of random length as two subtrees:
+///
+/// ```
+/// # #[cfg(feature = "std")] {
+/// use blake3::hazmat::{left_subtree_len, merge_subtrees_root, HasherExt, Mode};
+/// use blake3::{Hasher, CHUNK_LEN};
+///
+/// // Generate a random-length input. Note that to be split into two subtrees, the input length
+/// // must be greater than CHUNK_LEN.
+/// let input_len = rand::random_range(CHUNK_LEN + 1..1_000_000);
+/// let mut input = vec![0; input_len];
+/// rand::fill(&mut input[..]);
+///
+/// // Compute the left and right subtree hashes and then the root hash. left_subtree_len() tells
+/// // us exactly where to split the input. Any other split would either panic (if we're lucky) or
+/// // lead to an incorrect root hash.
+/// let left_len = left_subtree_len(input_len as u64) as usize;
+/// let left_subtree_cv = Hasher::new()
+///     .update(&input[..left_len])
+///     .finalize_non_root();
+/// let right_subtree_cv = Hasher::new()
+///     .set_input_offset(left_len as u64)
+///     .update(&input[left_len..])
+///     .finalize_non_root();
+/// let root_hash = merge_subtrees_root(&left_subtree_cv, &right_subtree_cv, Mode::Hash);
+///
+/// // Double check the answer.
+/// assert_eq!(root_hash, blake3::hash(&input));
+/// # }
+/// ```
+#[inline(always)]
+pub fn left_subtree_len(input_len: u64) -> u64 {
+    debug_assert!(input_len > CHUNK_LEN as u64);
+    // Note that .next_power_of_two() is greater than *or equal*.
+    ((input_len + 1) / 2).next_power_of_two()
+}
+
+#[test]
+fn test_left_subtree_len() {
+    assert_eq!(left_subtree_len(1025), 1024);
+    for boundary_case in [2, 4, 8, 16, 32, 64] {
+        let input_len = boundary_case * CHUNK_LEN as u64;
+        assert_eq!(left_subtree_len(input_len - 1), input_len / 2);
+        assert_eq!(left_subtree_len(input_len), input_len / 2);
+        assert_eq!(left_subtree_len(input_len + 1), input_len);
+    }
+}
+
+/// The `mode` argument to [`merge_subtrees_root`] and friends
+///
+/// See the [module level examples](index.html#examples).
+#[derive(Copy, Clone, Debug)]
+pub enum Mode<'a> {
+    /// Corresponding to [`hash`](crate::hash)
+    Hash,
+
+    /// Corresponding to [`keyed_hash`](crate::keyed_hash)
+    KeyedHash(&'a [u8; KEY_LEN]),
+
+    /// Corresponding to [`derive_key`](crate::derive_key)
+    ///
+    /// The [`ContextKey`] comes from [`hash_derive_key_context`].
+    DeriveKeyMaterial(&'a ContextKey),
+}
+
+impl<'a> Mode<'a> {
+    fn key_words(&self) -> CVWords {
+        match self {
+            Mode::Hash => *IV,
+            Mode::KeyedHash(key) => crate::platform::words_from_le_bytes_32(key),
+            Mode::DeriveKeyMaterial(cx_key) => crate::platform::words_from_le_bytes_32(cx_key),
+        }
+    }
+
+    fn flags_byte(&self) -> u8 {
+        match self {
+            Mode::Hash => 0,
+            Mode::KeyedHash(_) => crate::KEYED_HASH,
+            Mode::DeriveKeyMaterial(_) => crate::DERIVE_KEY_MATERIAL,
+        }
+    }
+}
+
+/// "Chaining value" is the academic term for a non-root or non-final hash.
+///
+/// Besides just sounding fancy, it turns out there are [security
+/// reasons](https://jacko.io/tree_hashing.html) to be careful about the difference between
+/// (root/final) hashes and (non-root/non-final) chaining values.
+pub type ChainingValue = [u8; OUT_LEN];
+
+fn merge_subtrees_inner(
+    left_child: &ChainingValue,
+    right_child: &ChainingValue,
+    mode: Mode,
+) -> crate::Output {
+    crate::parent_node_output(
+        &left_child,
+        &right_child,
+        &mode.key_words(),
+        mode.flags_byte(),
+        Platform::detect(),
+    )
+}
+
+/// Compute a non-root parent node chaining value from two child chaining values.
+/// +/// See the [module level examples](index.html#examples), particularly the discussion of valid tree +/// structures. The left and right child chaining values can come from either +/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or other calls to +/// `merge_subtrees_non_root`. "Chaining value" is the academic term for a non-root or non-final +/// hash. +pub fn merge_subtrees_non_root( + left_child: &ChainingValue, + right_child: &ChainingValue, + mode: Mode, +) -> ChainingValue { + merge_subtrees_inner(left_child, right_child, mode).chaining_value() +} + +/// Compute a root hash from two child chaining values. +/// +/// See the [module level examples](index.html#examples), particularly the discussion of valid tree +/// structures. The left and right child chaining values can come from either +/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`]. +/// "Chaining value" is the academic term for a non-root or non-final hash. +/// +/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed +/// using this function. In that case you must get the root hash from [`Hasher::finalize`] (or just +/// [`blake3::hash`](crate::hash)). +pub fn merge_subtrees_root( + left_child: &ChainingValue, + right_child: &ChainingValue, + mode: Mode, +) -> crate::Hash { + merge_subtrees_inner(left_child, right_child, mode).root_hash() +} + +/// Build a root [`OutputReader`](crate::OutputReader) from two child chaining values. +/// +/// See also the [module level examples](index.html#examples), particularly the discussion of valid +/// tree structures. The left and right child chaining values can come from either +/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`]. +/// "Chaining value" is the academic term for a non-root or non-final hash. +/// +/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed +/// using this function. In that case you must get the `OutputReader` from +/// [`Hasher::finalize_xof`]. +/// +/// # Example +/// +/// ``` +/// use blake3::hazmat::{merge_subtrees_root_xof, HasherExt, Mode}; +/// use blake3::{Hasher, CHUNK_LEN}; +/// +/// // Hash a 2-chunk subtree in steps. Note that only +/// // the final chunk can be shorter than CHUNK_LEN. +/// let chunk0 = &[42; CHUNK_LEN]; +/// let chunk1 = b"hello world"; +/// let chunk0_cv = Hasher::new() +/// .update(chunk0) +/// .finalize_non_root(); +/// let chunk1_cv = Hasher::new() +/// .set_input_offset(CHUNK_LEN as u64) +/// .update(chunk1) +/// .finalize_non_root(); +/// +/// // Obtain a blake3::OutputReader at the root and extract 1000 bytes. +/// let mut output_reader = merge_subtrees_root_xof(&chunk0_cv, &chunk1_cv, Mode::Hash); +/// let mut output_bytes = [0; 1_000]; +/// output_reader.fill(&mut output_bytes); +/// +/// // Double check the answer. +/// let mut hasher = Hasher::new(); +/// hasher.update(chunk0); +/// hasher.update(chunk1); +/// let mut expected = [0; 1_000]; +/// hasher.finalize_xof().fill(&mut expected); +/// assert_eq!(output_bytes, expected); +/// ``` +pub fn merge_subtrees_root_xof( + left_child: &ChainingValue, + right_child: &ChainingValue, + mode: Mode, +) -> crate::OutputReader { + crate::OutputReader::new(merge_subtrees_inner(left_child, right_child, mode)) +} + +/// An alias to distinguish [`hash_derive_key_context`] outputs from other keys. 
+pub type ContextKey = [u8; KEY_LEN];
+
+/// Hash a [`derive_key`](crate::derive_key) context string and return a [`ContextKey`].
+///
+/// The _only_ valid uses for the returned [`ContextKey`] are [`Hasher::new_from_context_key`] and
+/// [`Mode::DeriveKeyMaterial`] (together with the merge subtree functions).
+///
+/// # Example
+///
+/// ```
+/// use blake3::Hasher;
+/// use blake3::hazmat::HasherExt;
+///
+/// let context_key = blake3::hazmat::hash_derive_key_context("foo");
+/// let mut hasher = Hasher::new_from_context_key(&context_key);
+/// hasher.update(b"bar");
+/// let derived_key = *hasher.finalize().as_bytes();
+///
+/// assert_eq!(derived_key, blake3::derive_key("foo", b"bar"));
+/// ```
+pub fn hash_derive_key_context(context: &str) -> ContextKey {
+    crate::hash_all_at_once::<crate::join::SerialJoin>(
+        context.as_bytes(),
+        IV,
+        crate::DERIVE_KEY_CONTEXT,
+    )
+    .root_hash()
+    .0
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    #[should_panic]
+    fn test_empty_subtree_should_panic() {
+        Hasher::new().finalize_non_root();
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_unaligned_offset_should_panic() {
+        Hasher::new().set_input_offset(1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_hasher_already_accepted_input_should_panic() {
+        Hasher::new().update(b"x").set_input_offset(0);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_too_much_input_should_panic() {
+        Hasher::new()
+            .set_input_offset(CHUNK_LEN as u64)
+            .update(&[0; CHUNK_LEN + 1]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_set_input_offset_cant_finalize() {
+        Hasher::new().set_input_offset(CHUNK_LEN as u64).finalize();
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_set_input_offset_cant_finalize_xof() {
+        Hasher::new()
+            .set_input_offset(CHUNK_LEN as u64)
+            .finalize_xof();
+    }
+
+    #[test]
+    fn test_grouped_hash() {
+        const MAX_CHUNKS: usize = (crate::test::TEST_CASES_MAX + 1) / CHUNK_LEN;
+        let mut input_buf = [0; crate::test::TEST_CASES_MAX];
+        crate::test::paint_test_input(&mut input_buf);
+        for subtree_chunks in [1, 2, 4, 8, 16, 32] {
+            #[cfg(feature = "std")]
+            dbg!(subtree_chunks);
+            let subtree_len = subtree_chunks * CHUNK_LEN;
+            for &case in crate::test::TEST_CASES {
+                if case <= subtree_len {
+                    continue;
+                }
+                #[cfg(feature = "std")]
+                dbg!(case);
+                let input = &input_buf[..case];
+                let expected_hash = crate::hash(input);
+
+                // Collect all the group chaining values.
+                let mut chaining_values = arrayvec::ArrayVec::<ChainingValue, MAX_CHUNKS>::new();
+                let mut subtree_offset = 0;
+                while subtree_offset < input.len() {
+                    let take = core::cmp::min(subtree_len, input.len() - subtree_offset);
+                    let subtree_input = &input[subtree_offset..][..take];
+                    let subtree_cv = Hasher::new()
+                        .set_input_offset(subtree_offset as u64)
+                        .update(subtree_input)
+                        .finalize_non_root();
+                    chaining_values.push(subtree_cv);
+                    subtree_offset += take;
+                }
+
+                // Compress all the chaining_values together, layer by layer.
+                assert!(chaining_values.len() >= 2);
+                while chaining_values.len() > 2 {
+                    let n = chaining_values.len();
+                    // Merge each side-by-side pair in place, overwriting the front half of the
+                    // array with the merged results. This moves us "up one level" in the tree.
+                    for i in 0..(n / 2) {
+                        chaining_values[i] = merge_subtrees_non_root(
+                            &chaining_values[2 * i],
+                            &chaining_values[2 * i + 1],
+                            Mode::Hash,
+                        );
+                    }
+                    // If there's an odd CV out, it moves up.
+ if n % 2 == 1 { + chaining_values[n / 2] = chaining_values[n - 1]; + } + chaining_values.truncate(n / 2 + n % 2); + } + assert_eq!(chaining_values.len(), 2); + let root_hash = + merge_subtrees_root(&chaining_values[0], &chaining_values[1], Mode::Hash); + assert_eq!(expected_hash, root_hash); + } + } + } + + #[test] + fn test_keyed_hash_xof() { + let group0 = &[42; 4096]; + let group1 = &[43; 4095]; + let mut input = [0; 8191]; + input[..4096].copy_from_slice(group0); + input[4096..].copy_from_slice(group1); + let key = &[44; 32]; + + let mut expected_output = [0; 100]; + Hasher::new_keyed(&key) + .update(&input) + .finalize_xof() + .fill(&mut expected_output); + + let mut hazmat_output = [0; 100]; + let left = Hasher::new_keyed(key).update(group0).finalize_non_root(); + let right = Hasher::new_keyed(key) + .set_input_offset(group0.len() as u64) + .update(group1) + .finalize_non_root(); + merge_subtrees_root_xof(&left, &right, Mode::KeyedHash(&key)).fill(&mut hazmat_output); + assert_eq!(expected_output, hazmat_output); + } + + #[test] + fn test_derive_key() { + let context = "foo"; + let mut input = [0; 1025]; + crate::test::paint_test_input(&mut input); + let expected = crate::derive_key(context, &input); + + let cx_key = hash_derive_key_context(context); + let left = Hasher::new_from_context_key(&cx_key) + .update(&input[..1024]) + .finalize_non_root(); + let right = Hasher::new_from_context_key(&cx_key) + .set_input_offset(1024) + .update(&input[1024..]) + .finalize_non_root(); + let derived_key = merge_subtrees_root(&left, &right, Mode::DeriveKeyMaterial(&cx_key)).0; + assert_eq!(expected, derived_key); + } +} diff --git a/src/lib.rs b/src/lib.rs index 92bdbdb..027d4ba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -92,13 +92,12 @@ #[cfg(test)] mod test; -// The guts module is for incremental use cases like the `bao` crate that need -// to explicitly compute chunk and parent chaining values. It is semi-stable -// and likely to keep working, but largely undocumented and not intended for -// widespread use. #[doc(hidden)] +#[deprecated(since = "1.8.0", note = "use the hazmat module instead")] pub mod guts; +pub mod hazmat; + /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] pub mod platform; @@ -155,8 +154,20 @@ pub const OUT_LEN: usize = 32; /// The number of bytes in a key, 32. pub const KEY_LEN: usize = 32; +/// The number of bytes in a block, 64. +/// +/// You don't usually need to think about this number. One case where it matters is calling +/// [`OutputReader::fill`] in a loop, where using a `buf` argument that's a multiple of `BLOCK_LEN` +/// avoids repeating work. +pub const BLOCK_LEN: usize = 64; + +/// The number of bytes in a chunk, 1024. +/// +/// You don't usually need to think about this number, but it often comes up in benchmarks, because +/// the maximum degree of parallelism used by the implementation equals the number of chunks. 
+pub const CHUNK_LEN: usize = 1024; + const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 -use guts::{BLOCK_LEN, CHUNK_LEN}; // While iterating the compression function within a chunk, the CV is // represented as words, to avoid doing two extra endianness conversions for @@ -227,7 +238,7 @@ fn counter_high(counter: u64) -> u32 { /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -#[derive(Clone, Copy, Hash)] +#[derive(Clone, Copy, Hash, Eq)] pub struct Hash([u8; OUT_LEN]); impl Hash { @@ -354,8 +365,6 @@ impl PartialEq<[u8]> for Hash { } } -impl Eq for Hash {} - impl fmt::Display for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` @@ -504,7 +513,7 @@ impl ChunkState { } } - fn len(&self) -> usize { + fn count(&self) -> usize { BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize } @@ -561,7 +570,7 @@ impl ChunkState { self.fill_buf(&mut input); debug_assert!(input.is_empty()); - debug_assert!(self.len() <= CHUNK_LEN); + debug_assert!(self.count() <= CHUNK_LEN); self } @@ -582,7 +591,7 @@ impl ChunkState { impl fmt::Debug for ChunkState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("ChunkState") - .field("len", &self.len()) + .field("count", &self.count()) .field("chunk_counter", &self.chunk_counter) .field("flags", &self.flags) .field("platform", &self.platform) @@ -646,22 +655,12 @@ impl IncrementCounter { } } -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). +// The largest power of two less than or equal to `n`, used in Hasher::update(). This is similar to +// left_subtree_len(n), but note that left_subtree_len(n) is strictly less than `n`. fn largest_power_of_two_leq(n: usize) -> usize { ((n / 2) + 1).next_power_of_two() } -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time // on a single thread. Write out the chunk chaining values and return the // number of chunks hashed. These chunks are never the root and never empty; @@ -790,7 +789,7 @@ fn compress_subtree_wide( // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); - let (left, right) = input.split_at(left_len(input.len())); + let (left, right) = input.split_at(hazmat::left_subtree_len(input.len() as u64) as usize); let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; // Make space for the child outputs. 
Here we use MAX_SIMD_DEGREE_OR_2 to @@ -996,10 +995,8 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { /// /// [Argon2]: https://en.wikipedia.org/wiki/Argon2 pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { - let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) - .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + let context_key = hazmat::hash_derive_key_context(context); + let context_key_words = platform::words_from_le_bytes_32(&context_key); hash_all_at_once::(key_material, &context_key_words, DERIVE_KEY_MATERIAL) .root_hash() .0 @@ -1063,6 +1060,7 @@ fn parent_node_output( pub struct Hasher { key: CVWords, chunk_state: ChunkState, + initial_chunk_counter: u64, // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because @@ -1076,6 +1074,7 @@ impl Hasher { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), + initial_chunk_counter: 0, cv_stack: ArrayVec::new(), } } @@ -1100,10 +1099,8 @@ impl Hasher { /// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(context: &str) -> Self { - let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) - .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + let context_key = hazmat::hash_derive_key_context(context); + let context_key_words = platform::words_from_le_bytes_32(&context_key); Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) } @@ -1133,8 +1130,12 @@ impl Hasher { // the CV on top of the stack. The principle is the same: each CV that // should remain in the stack is represented by a 1-bit in the total number // of chunks (or bytes) so far. - fn merge_cv_stack(&mut self, total_len: u64) { - let post_merge_stack_len = total_len.count_ones() as usize; + fn merge_cv_stack(&mut self, chunk_counter: u64) { + // Account for non-zero cases of Hasher::set_input_offset, where there are no prior + // subtrees in the stack. Note that initial_chunk_counter is always 0 for callers who don't + // use the hazmat module. + let post_merge_stack_len = + (chunk_counter - self.initial_chunk_counter).count_ones() as usize; while self.cv_stack.len() > post_merge_stack_len { let right_child = self.cv_stack.pop().unwrap(); let left_child = self.cv_stack.pop().unwrap(); @@ -1199,10 +1200,21 @@ impl Hasher { } fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { + let input_offset = self.initial_chunk_counter * CHUNK_LEN as u64; + if let Some(max) = hazmat::max_subtree_len(input_offset) { + let remaining = max - self.count(); + assert!( + input.len() as u64 <= remaining, + "the subtree starting at {} contains at most {} bytes (found {})", + CHUNK_LEN as u64 * self.initial_chunk_counter, + max, + input.len(), + ); + } // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. - if self.chunk_state.len() > 0 { - let want = CHUNK_LEN - self.chunk_state.len(); + if self.chunk_state.count() > 0 { + let want = CHUNK_LEN - self.chunk_state.count(); let take = cmp::min(want, input.len()); self.chunk_state.update(&input[..take]); input = &input[take..]; @@ -1210,7 +1222,7 @@ impl Hasher { // We've filled the current chunk, and there's more input // coming, so we know it's not the root and we can finalize it. 
// Then we'll proceed to hashing whole chunks below. - debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); + debug_assert_eq!(self.chunk_state.count(), CHUNK_LEN); let chunk_cv = self.chunk_state.output().chaining_value(); self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); self.chunk_state = ChunkState::new( @@ -1238,7 +1250,7 @@ impl Hasher { // Because we might need to break up the input to form powers of 2, or // to evenly divide what we already have, this part runs in a loop. while input.len() > CHUNK_LEN { - debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); + debug_assert_eq!(self.chunk_state.count(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); let mut subtree_len = largest_power_of_two_leq(input.len()); let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; @@ -1323,7 +1335,7 @@ impl Hasher { // also. Convert it directly into an Output. Otherwise, we need to // merge subtrees below. if self.cv_stack.is_empty() { - debug_assert_eq!(self.chunk_state.chunk_counter, 0); + debug_assert_eq!(self.chunk_state.chunk_counter, self.initial_chunk_counter); return self.chunk_state.output(); } @@ -1341,11 +1353,11 @@ impl Hasher { // the empty chunk is taken care of above. let mut output: Output; let mut num_cvs_remaining = self.cv_stack.len(); - if self.chunk_state.len() > 0 { + if self.chunk_state.count() > 0 { debug_assert_eq!( self.cv_stack.len(), - self.chunk_state.chunk_counter.count_ones() as usize, - "cv stack does not need a merge" + (self.chunk_state.chunk_counter - self.initial_chunk_counter).count_ones() as usize, + "cv stack does not need a merge", ); output = self.chunk_state.output(); } else { @@ -1378,6 +1390,10 @@ impl Hasher { /// This method is idempotent. Calling it twice will give the same result. /// You can also add more input and finalize again. pub fn finalize(&self) -> Hash { + assert_eq!( + self.initial_chunk_counter, 0, + "set_input_offset must be used with finalize_non_root", + ); self.final_output().root_hash() } @@ -1389,12 +1405,22 @@ impl Hasher { /// /// [`OutputReader`]: struct.OutputReader.html pub fn finalize_xof(&self) -> OutputReader { + assert_eq!( + self.initial_chunk_counter, 0, + "set_input_offset must be used with finalize_non_root", + ); OutputReader::new(self.final_output()) } /// Return the total number of bytes hashed so far. + /// + /// [`hazmat::HasherExt::set_input_offset`] does not affect this value. This only counts bytes + /// passed to [`update`](Hasher::update). pub fn count(&self) -> u64 { - self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 + // Account for non-zero cases of Hasher::set_input_offset. Note that initial_chunk_counter + // is always 0 for callers who don't use the hazmat module. + (self.chunk_state.chunk_counter - self.initial_chunk_counter) * CHUNK_LEN as u64 + + self.chunk_state.count() as u64 } /// As [`update`](Hasher::update), but reading from a @@ -1613,11 +1639,13 @@ impl Zeroize for Hasher { let Self { key, chunk_state, + initial_chunk_counter, cv_stack, } = self; key.zeroize(); chunk_state.zeroize(); + initial_chunk_counter.zeroize(); cv_stack.zeroize(); } } @@ -1684,7 +1712,7 @@ impl OutputReader { /// calling `fill` repeatedly with a short-length or odd-length slice will /// end up performing the same compression multiple times. If you're /// reading output in a loop, prefer a slice length that's a multiple of - /// 64. + /// [`BLOCK_LEN`] (64 bytes). 
/// /// The maximum output size of BLAKE3 is 264-1 bytes. If you try /// to extract more than that, for example by seeking near the end and diff --git a/src/test.rs b/src/test.rs index 506d7aa..2d21856 100644 --- a/src/test.rs +++ b/src/test.rs @@ -38,8 +38,12 @@ pub const TEST_CASES: &[usize] = &[ 7 * CHUNK_LEN + 1, 8 * CHUNK_LEN, 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 16 * CHUNK_LEN - 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 16 * CHUNK_LEN + 1, + 31 * CHUNK_LEN - 1, + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 31 * CHUNK_LEN + 1, 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks ]; @@ -327,22 +331,6 @@ fn test_largest_power_of_two_leq() { } } -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(crate::left_len(input), output); - } -} - #[test] fn test_compare_reference_impl() { const OUT: usize = 303; // more than 64, not a multiple of 4 @@ -801,6 +789,7 @@ fn test_zeroize() { flags: 42, platform: crate::Platform::Portable, }, + initial_chunk_counter: 42, key: [42; 8], cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), }; @@ -815,6 +804,7 @@ fn test_zeroize() { hasher.chunk_state.platform, crate::Platform::Portable )); + assert_eq!(hasher.initial_chunk_counter, 0); assert_eq!(hasher.key, [0; 8]); assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); @@ -1020,3 +1010,40 @@ fn test_miri_smoketest() { reader.set_position(999999); reader.fill(&mut [0]); } + +// I had to move these tests out of the deprecated guts module, because leaving them there causes +// an un-silenceable warning: https://github.com/rust-lang/rust/issues/47238 +#[cfg(test)] +#[allow(deprecated)] +mod guts_tests { + use crate::guts::*; + + #[test] + fn test_chunk() { + assert_eq!( + crate::hash(b"foo"), + ChunkState::new(0).update(b"foo").finalize(true) + ); + } + + #[test] + fn test_parents() { + let mut hasher = crate::Hasher::new(); + let mut buf = [0; crate::CHUNK_LEN]; + + buf[0] = 'a' as u8; + hasher.update(&buf); + let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); + + buf[0] = 'b' as u8; + hasher.update(&buf); + let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); + + hasher.update(b"c"); + let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); + + let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); + let root = parent_cv(&parent, &chunk2_cv, true); + assert_eq!(hasher.finalize(), root); + } +} diff --git a/test_vectors/src/lib.rs b/test_vectors/src/lib.rs index 543a31d..8df28cd 100644 --- a/test_vectors/src/lib.rs +++ b/test_vectors/src/lib.rs @@ -1,4 +1,4 @@ -use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; +use blake3::{BLOCK_LEN, CHUNK_LEN}; use serde::{Deserialize, Serialize}; // Reading files at runtime requires special configuration under WASM/WASI, so including this at -- cgit v1.2.3
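One usage note on the new `BLOCK_LEN` export and the `OutputReader::fill` guidance in this patch: reading extended output in buffers whose length is a multiple of `BLOCK_LEN` avoids recompressing any block. A minimal editorial sketch, not part of the patch:

```rust
use blake3::{Hasher, BLOCK_LEN};

fn main() {
    let mut reader = Hasher::new().update(b"example input").finalize_xof();

    // Read 1 MiB of XOF output in 4-block (256-byte) steps. Because the buffer length
    // is a multiple of BLOCK_LEN, each 64-byte output block is produced exactly once.
    let mut buf = [0u8; 4 * BLOCK_LEN];
    let mut remaining: usize = 1 << 20;
    while remaining > 0 {
        reader.fill(&mut buf);
        // ...consume `buf` here...
        remaining -= buf.len();
    }
}
```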