-rw-r--r--  benches/bench.rs                            19
-rw-r--r--  c/blake3.c                                  20
-rw-r--r--  c/blake3_c_rust_bindings/benches/bench.rs   21
-rw-r--r--  src/lib.rs                                  29

4 files changed, 71 insertions, 18 deletions
diff --git a/benches/bench.rs b/benches/bench.rs
index 70be967..263f81e 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -475,3 +475,22 @@ fn bench_rayon_0512_kib(b: &mut Bencher) {
 fn bench_rayon_1024_kib(b: &mut Bencher) {
     bench_rayon(b, 1024 * KIB);
 }
+
+// This checks that update() splits up its input in increasing powers of 2, so
+// that it can recover a high degree of parallelism when the number of bytes
+// hashed so far is uneven. The performance of this benchmark should be
+// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
+// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
+// performance was less than half.
+#[bench]
+fn bench_two_updates(b: &mut Bencher) {
+    let len = 65536;
+    let mut input = RandomInput::new(b, len);
+    b.iter(|| {
+        let mut hasher = blake3::Hasher::new();
+        let input = input.get();
+        hasher.update(&input[..1]);
+        hasher.update(&input[1..]);
+        hasher.finalize()
+    });
+}
diff --git a/c/blake3.c b/c/blake3.c
--- a/c/blake3.c
+++ b/c/blake3.c
@@ -486,16 +486,22 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
   while (input_len > BLAKE3_CHUNK_LEN) {
     size_t subtree_len = round_down_to_power_of_2(input_len);
     uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
-    // Shrink the subtree_len until *half of it* it evenly divides the count so
-    // far. Why half? Because compress_subtree_to_parent_node will return a
-    // pair of chaining values, each representing half of the input. As long as
-    // those evenly divide the input so far, we're good. We know that
-    // subtree_len itself is a power of 2, so we can use a bitmasking trick
-    // instead of an actual remainder operation. (Note that if the caller
+    // Shrink the subtree_len until it evenly divides the count so far. We know
+    // that subtree_len itself is a power of 2, so we can use a bitmasking
+    // trick instead of an actual remainder operation. (Note that if the caller
     // consistently passes power-of-2 inputs of the same size, as is hopefully
     // typical, this loop condition will always fail, and subtree_len will
     // always be the full length of the input.)
-    while ((((uint64_t)((subtree_len / 2) - 1)) & count_so_far) != 0) {
+    //
+    // An aside: We don't have to shrink subtree_len quite this much. For
+    // example, if count_so_far is 1, we could pass 2 chunks to
+    // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+    // get the right answer in the end, and we might get to use 2-way SIMD
+    // parallelism. The problem with this optimization is that it gets us
+    // stuck always hashing 2 chunks. The total number of chunks will remain
+    // odd, and we'll never graduate to higher degrees of parallelism. See
+    // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+    while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
       subtree_len /= 2;
     }
     // The shrunken subtree_len might now be 1 chunk long. If so, hash that one
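The new loop condition relies on a standard identity: when the divisor is a
power of 2, masking by divisor minus 1 is equivalent to taking a remainder. A
quick standalone sanity check of that equivalence, written as a plain Rust
program rather than anything from this patch, might look like this:

// Sanity check: for a power-of-2 subtree_len, the bitmask test
// (subtree_len - 1) & count_so_far == 0 holds exactly when
// count_so_far % subtree_len == 0, because subtree_len - 1 masks off
// precisely the low bits that a remainder by subtree_len would inspect.
fn main() {
    for shift in 0..10u32 {
        let subtree_len: u64 = 1 << shift; // always a power of 2
        for count_so_far in 0..4096u64 {
            assert_eq!(
                (subtree_len - 1) & count_so_far == 0,
                count_so_far % subtree_len == 0,
            );
        }
    }
    println!("bitmask trick matches the remainder operation");
}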
diff --git a/c/blake3_c_rust_bindings/benches/bench.rs b/c/blake3_c_rust_bindings/benches/bench.rs
index 36b6211..4335a26 100644
--- a/c/blake3_c_rust_bindings/benches/bench.rs
+++ b/c/blake3_c_rust_bindings/benches/bench.rs
@@ -332,3 +332,24 @@ fn bench_incremental_0512_kib(b: &mut Bencher) {
 fn bench_incremental_1024_kib(b: &mut Bencher) {
     bench_incremental(b, 1024 * KIB);
 }
+
+// This checks that update() splits up its input in increasing powers of 2, so
+// that it can recover a high degree of parallelism when the number of bytes
+// hashed so far is uneven. The performance of this benchmark should be
+// reasonably close to bench_incremental_0064_kib, within 80% or so. When we
+// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
+// performance was less than half.
+#[bench]
+fn bench_two_updates(b: &mut Bencher) {
+    let len = 65536;
+    let mut input = RandomInput::new(b, len);
+    b.iter(|| {
+        let mut hasher = blake3_c_rust_bindings::Hasher::new();
+        let input = input.get();
+        hasher.update(&input[..1]);
+        hasher.update(&input[1..]);
+        let mut out = [0; 32];
+        hasher.finalize(&mut out);
+        out
+    });
+}
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1023,17 +1023,24 @@ impl Hasher {
             debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
             let mut subtree_len = largest_power_of_two_leq(input.len());
             let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
-            // Shrink the subtree_len until *half of it* it evenly divides the
-            // count so far. Why half? Because compress_subtree_to_parent_node
-            // will return a pair of chaining values, each representing half of
-            // the input. As long as those evenly divide the input so far,
-            // we're good. We know that subtree_len itself is a power of 2, so
-            // we can use a bitmasking trick instead of an actual remainder
-            // operation. (Note that if the caller consistently passes
-            // power-of-2 inputs of the same size, as is hopefully typical,
-            // this loop condition will always fail, and subtree_len will
-            // always be the full length of the input.)
-            while ((subtree_len / 2) - 1) as u64 & count_so_far != 0 {
+            // Shrink the subtree_len until it evenly divides the count so far.
+            // We know that subtree_len itself is a power of 2, so we can use a
+            // bitmasking trick instead of an actual remainder operation. (Note
+            // that if the caller consistently passes power-of-2 inputs of the
+            // same size, as is hopefully typical, this loop condition will
+            // always fail, and subtree_len will always be the full length of
+            // the input.)
+            //
+            // An aside: We don't have to shrink subtree_len quite this much.
+            // For example, if count_so_far is 1, we could pass 2 chunks to
+            // compress_subtree_to_parent_node. Since we'll get 2 CVs back,
+            // we'll still get the right answer in the end, and we might get to
+            // use 2-way SIMD parallelism. The problem with this optimization
+            // is that it gets us stuck always hashing 2 chunks. The total
+            // number of chunks will remain odd, and we'll never graduate to
+            // higher degrees of parallelism. See
+            // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+            while (subtree_len - 1) as u64 & count_so_far != 0 {
                 subtree_len /= 2;
             }
             // The shrunken subtree_len might now be 1 chunk long. If so, hash
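To make the benchmark comment's "increasing powers of 2" claim concrete, the
sketch below replays the fixed splitting logic in whole-chunk units. It is a
hypothetical helper for illustration, not code from this patch: the real loops
work in bytes and stop once at most one chunk remains, handing the tail to the
chunk state. Starting from the odd count that bench_two_updates creates (1
chunk hashed, 63 remaining), the subtree sizes climb 1, 2, 4, 8, 16, 32 rather
than staying stuck at a small size, which is the regression issue #69
described:

// Hypothetical replay of the fixed splitting logic, in whole-chunk
// units for readability. Not library code.
fn split_sequence(mut count_so_far: u64, mut remaining: u64) -> Vec<u64> {
    let mut subtrees = Vec::new();
    while remaining > 0 {
        // Largest power of 2 <= remaining, playing the role of
        // round_down_to_power_of_2 / largest_power_of_two_leq.
        let mut subtree_len = 1u64 << (63 - remaining.leading_zeros());
        // The fixed condition: shrink subtree_len until it evenly
        // divides the count so far, via the power-of-2 bitmask trick.
        while (subtree_len - 1) & count_so_far != 0 {
            subtree_len /= 2;
        }
        subtrees.push(subtree_len);
        count_so_far += subtree_len;
        remaining -= subtree_len;
    }
    subtrees
}

fn main() {
    // One chunk already hashed (the 1-byte first update, once the first
    // chunk fills), 63 chunks still to go: parallelism recovers by
    // doubling at each step.
    assert_eq!(split_sequence(1, 63), vec![1, 2, 4, 8, 16, 32]);
}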
