-rw-r--r--  benches/bench.rs                            19
-rw-r--r--  c/blake3.c                                  20
-rw-r--r--  c/blake3_c_rust_bindings/benches/bench.rs   21
-rw-r--r--  src/lib.rs                                  29

4 files changed, 71 insertions, 18 deletions
diff --git a/benches/bench.rs b/benches/bench.rs
index 70be967..263f81e 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -475,3 +475,22 @@ fn bench_rayon_0512_kib(b: &mut Bencher) {
 fn bench_rayon_1024_kib(b: &mut Bencher) {
     bench_rayon(b, 1024 * KIB);
 }
+
+// This checks that update() splits up its input in increasing powers of 2, so
+// that it can recover a high degree of parallelism when the number of bytes
+// hashed so far is uneven. The performance of this benchmark should be
+// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
+// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
+// performance was less than half.
+#[bench]
+fn bench_two_updates(b: &mut Bencher) {
+    let len = 65536;
+    let mut input = RandomInput::new(b, len);
+    b.iter(|| {
+        let mut hasher = blake3::Hasher::new();
+        let input = input.get();
+        hasher.update(&input[..1]);
+        hasher.update(&input[1..]);
+        hasher.finalize()
+    });
+}
diff --git a/c/blake3.c b/c/blake3.c
--- a/c/blake3.c
+++ b/c/blake3.c
@@ -486,16 +486,22 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
   while (input_len > BLAKE3_CHUNK_LEN) {
     size_t subtree_len = round_down_to_power_of_2(input_len);
     uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
-    // Shrink the subtree_len until *half of it* it evenly divides the count so
-    // far. Why half? Because compress_subtree_to_parent_node will return a
-    // pair of chaining values, each representing half of the input. As long as
-    // those evenly divide the input so far, we're good. We know that
-    // subtree_len itself is a power of 2, so we can use a bitmasking trick
-    // instead of an actual remainder operation. (Note that if the caller
+    // Shrink the subtree_len until it evenly divides the count so far. We know
+    // that subtree_len itself is a power of 2, so we can use a bitmasking
+    // trick instead of an actual remainder operation. (Note that if the caller
     // consistently passes power-of-2 inputs of the same size, as is hopefully
     // typical, this loop condition will always fail, and subtree_len will
     // always be the full length of the input.)
-    while ((((uint64_t)((subtree_len / 2) - 1)) & count_so_far) != 0) {
+    //
+    // An aside: We don't have to shrink subtree_len quite this much. For
+    // example, if count_so_far is 1, we could pass 2 chunks to
+    // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+    // get the right answer in the end, and we might get to use 2-way SIMD
+    // parallelism. The problem with this optimization is that it gets us
+    // stuck always hashing 2 chunks. The total number of chunks will remain
+    // odd, and we'll never graduate to higher degrees of parallelism. See
+    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+    while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
       subtree_len /= 2;
     }
     // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
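The new loop condition relies on a standard identity: when the divisor is a
power of 2, masking by divisor minus 1 is equivalent to taking a remainder. A
quick standalone sanity check of that equivalence, written as a plain Rust
program rather than anything from this patch, might look like this:

// Sanity check: for a power-of-2 subtree_len, the bitmask test
// (subtree_len - 1) & count_so_far == 0 holds exactly when
// count_so_far % subtree_len == 0, because subtree_len - 1 masks off
// precisely the low bits that a remainder by subtree_len would inspect.
fn main() {
    for shift in 0..10u32 {
        let subtree_len: u64 = 1 << shift; // always a power of 2
        for count_so_far in 0..4096u64 {
            assert_eq!(
                (subtree_len - 1) & count_so_far == 0,
                count_so_far % subtree_len == 0,
            );
        }
    }
    println!("bitmask trick matches the remainder operation");
}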
diff --git a/c/blake3_c_rust_bindings/benches/bench.rs b/c/blake3_c_rust_bindings/benches/bench.rs
index 36b6211..4335a26 100644
--- a/c/blake3_c_rust_bindings/benches/bench.rs
+++ b/c/blake3_c_rust_bindings/benches/bench.rs
@@ -332,3 +332,24 @@ fn bench_incremental_0512_kib(b: &mut Bencher) {
 fn bench_incremental_1024_kib(b: &mut Bencher) {
     bench_incremental(b, 1024 * KIB);
 }
+
+// This checks that update() splits up its input in increasing powers of 2, so
+// that it can recover a high degree of parallelism when the number of bytes
+// hashed so far is uneven. The performance of this benchmark should be
+// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
+// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
+// performance was less than half.
+#[bench]
+fn bench_two_updates(b: &mut Bencher) {
+    let len = 65536;
+    let mut input = RandomInput::new(b, len);
+    b.iter(|| {
+        let mut hasher = blake3_c_rust_bindings::Hasher::new();
+        let input = input.get();
+        hasher.update(&input[..1]);
+        hasher.update(&input[1..]);
+        let mut out = [0; 32];
+        hasher.finalize(&mut out);
+        out
+    });
+}
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1023,17 +1023,24 @@ impl Hasher {
             debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
             let mut subtree_len = largest_power_of_two_leq(input.len());
             let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
-            // Shrink the subtree_len until *half of it* it evenly divides the
-            // count so far. Why half? Because compress_subtree_to_parent_node
-            // will return a pair of chaining values, each representing half of
-            // the input. As long as those evenly divide the input so far,
-            // we're good. We know that subtree_len itself is a power of 2, so
-            // we can use a bitmasking trick instead of an actual remainder
-            // operation. (Note that if the caller consistently passes
-            // power-of-2 inputs of the same size, as is hopefully typical,
-            // this loop condition will always fail, and subtree_len will
-            // always be the full length of the input.)
-            while ((subtree_len / 2) - 1) as u64 & count_so_far != 0 {
+            // Shrink the subtree_len until it evenly divides the count so far.
+            // We know that subtree_len itself is a power of 2, so we can use a
+            // bitmasking trick instead of an actual remainder operation. (Note
+            // that if the caller consistently passes power-of-2 inputs of the
+            // same size, as is hopefully typical, this loop condition will
+            // always fail, and subtree_len will always be the full length of
+            // the input.)
+            //
+            // An aside: We don't have to shrink subtree_len quite this much.
+            // For example, if count_so_far is 1, we could pass 2 chunks to
+            // compress_subtree_to_parent_node. Since we'll get 2 CVs back,
+            // we'll still get the right answer in the end, and we might get to
+            // use 2-way SIMD parallelism. The problem with this optimization
+            // is that it gets us stuck always hashing 2 chunks. The total
+            // number of chunks will remain odd, and we'll never graduate to
+            // higher degrees of parallelism. See
+            // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+            while (subtree_len - 1) as u64 & count_so_far != 0 {
                 subtree_len /= 2;
             }
             // The shrunken subtree_len might now be 1 chunk long. If so, hash
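To make the benchmark comment's "increasing powers of 2" claim concrete, the
sketch below replays the fixed splitting logic in whole-chunk units. It is a
hypothetical helper for illustration, not code from this patch: the real loops
work in bytes and stop once at most one chunk remains, handing the tail to the
chunk state. Starting from the odd count that bench_two_updates creates (1
chunk hashed, 63 remaining), the subtree sizes climb 1, 2, 4, 8, 16, 32 rather
than staying stuck at a small size, which is the regression issue #69
described:

// Hypothetical replay of the fixed splitting logic, in whole-chunk
// units for readability. Not library code.
fn split_sequence(mut count_so_far: u64, mut remaining: u64) -> Vec<u64> {
    let mut subtrees = Vec::new();
    while remaining > 0 {
        // Largest power of 2 <= remaining, playing the role of
        // round_down_to_power_of_2 / largest_power_of_two_leq.
        let mut subtree_len = 1u64 << (63 - remaining.leading_zeros());
        // The fixed condition: shrink subtree_len until it evenly
        // divides the count so far, via the power-of-2 bitmask trick.
        while (subtree_len - 1) & count_so_far != 0 {
            subtree_len /= 2;
        }
        subtrees.push(subtree_len);
        count_so_far += subtree_len;
        remaining -= subtree_len;
    }
    subtrees
}

fn main() {
    // One chunk already hashed (the 1-byte first update, once the first
    // chunk fills), 63 chunks still to go: parallelism recovers by
    // doubling at each step.
    assert_eq!(split_sequence(1, 63), vec![1, 2, 4, 8, 16, 32]);
}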
