author    Jack O'Connor <[email protected]>  2021-02-28 21:11:21 -0500
committer Jack O'Connor <[email protected]>  2021-03-14 00:26:18 -0500
commit    b228f46e0308d9c48d19ee077a2f73a402fa26c3 (patch)
tree      c54ab2fbad167197e3a84eff1dd9ea2ec942d77c
parent    ea72822620ba77e4f597bf6d6bd4bd8c3b4cc9dc (diff)
add *_rayon methods
-rw-r--r--  Cargo.toml         |   7
-rw-r--r--  b3sum/src/main.rs  |   2
-rw-r--r--  benches/bench.rs   |   6
-rw-r--r--  src/join.rs        |  11
-rw-r--r--  src/lib.rs         | 107
-rw-r--r--  src/test.rs        |  59
6 files changed, 137 insertions(+), 55 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 567a008..563375c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,10 +25,9 @@ neon = []
# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
std = ["digest/std"]
-# The "rayon" feature (defined below as an optional dependency) enables the
-# join::RayonJoin type, which can be used with Hasher::update_with_join to
-# perform multi-threaded hashing. However, even if this feature is enabled, all
-# other APIs remain single-threaded.
+# The "rayon" feature (defined below as an optional dependency) enables API
+# functions like `hash_rayon` and `update_rayon`. However, even if this feature
+# is enabled, all other APIs remain single-threaded.
# ---------- Features below this line are for internal testing only. ----------
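For context, here is a minimal sketch of what the `rayon` feature gates after this patch. The dependency line in the comment and the 1 MiB input size are illustrative assumptions, not taken from the repository.

```rust
// Minimal sketch: requires the feature on the caller's side, e.g.
// `blake3 = { version = "...", features = ["rayon"] }` in Cargo.toml.
fn main() {
    let input = vec![0u8; 1 << 20]; // an arbitrary 1 MiB input, for illustration
    let hash = blake3::hash_rayon(&input);
    println!("{}", hash.to_hex());
}
```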
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index b01e5de..3810bfe 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -219,7 +219,7 @@ impl Input {
// multiple threads. This doesn't work on stdin, or on some files,
// and it can also be disabled with --no-mmap.
Self::Mmap(cursor) => {
- hasher.update_with_join::<blake3::join::RayonJoin>(cursor.get_ref());
+ hasher.update_rayon(cursor.get_ref());
}
// The slower paths, for stdin or files we didn't/couldn't mmap.
// This is currently all single-threaded. Doing multi-threaded
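For readers who want the same pattern outside b3sum, the following is a hedged sketch. The `memmap2` crate and the `hash_file_mmap` helper name are assumptions for illustration; b3sum's own flag handling and non-mmap fallbacks are omitted.

```rust
use std::fs::File;

// A sketch of the mmap + update_rayon pattern, assuming the `memmap2` crate
// and blake3's `rayon` feature.
fn hash_file_mmap(path: &str) -> std::io::Result<blake3::Hash> {
    let file = File::open(path)?;
    // Safety: the file must not be truncated or modified while it is mapped.
    let map = unsafe { memmap2::Mmap::map(&file)? };
    let mut hasher = blake3::Hasher::new();
    hasher.update_rayon(&map);
    Ok(hasher.finalize())
}
```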
diff --git a/benches/bench.rs b/benches/bench.rs
index ba5a404..832f0f8 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -421,11 +421,7 @@ fn bench_reference_1024_kib(b: &mut Bencher) {
#[cfg(feature = "rayon")]
fn bench_rayon(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
- b.iter(|| {
- blake3::Hasher::new()
- .update_with_join::<blake3::join::RayonJoin>(input.get())
- .finalize()
- });
+ b.iter(|| blake3::hash_rayon(input.get()));
}
#[bench]
diff --git a/src/join.rs b/src/join.rs
index 2435bc6..227216a 100644
--- a/src/join.rs
+++ b/src/join.rs
@@ -1,11 +1,12 @@
-//! The multi-threading abstractions used by [`Hasher::update_with_join`].
+//! The multi-threading abstractions used by `Hasher::update_with_join`.
//!
//! Different implementations of the `Join` trait determine whether
-//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large
+//! `Hasher::update_with_join` performs multi-threading on sufficiently large
//! inputs. The `SerialJoin` implementation is single-threaded, and the
-//! `RayonJoin` implementation (gated by the `rayon` feature) is
-//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like
-//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally.
+//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded.
+//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash)
+//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin`
+//! internally.
//!
//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and
//! `RayonJoin` is the only non-trivial implementation. Previously this trait
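For orientation, the trait being de-linked here has roughly the following shape; this is a sketch, and the authoritative definition lives in src/join.rs.

```rust
// Roughly the shape of the Join abstraction described above; the exact
// bounds in src/join.rs may differ slightly.
trait Join {
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send;
}
```

As the module docs say, `SerialJoin` runs both closures on the calling thread, while `RayonJoin` forwards to `rayon::join` to run them potentially in parallel.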
diff --git a/src/lib.rs b/src/lib.rs
index 8b246cc..effa31a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -32,9 +32,9 @@
//!
//! # Cargo Features
//!
-//! The `rayon` feature provides [Rayon]-based multi-threading, in particular
-//! the [`join::RayonJoin`] type for use with [`Hasher::update_with_join`]. It
-//! is disabled by default, but enabled for [docs.rs].
+//! The `rayon` feature provides [Rayon]-based multi-threading, via functions
+//! with the `_rayon` suffix. It is disabled by default, but enabled for
+//! [docs.rs].
//!
//! The `neon` feature enables ARM NEON support. Currently there is no runtime
//! CPU feature detection for NEON, so you must only enable this feature for
@@ -107,7 +107,6 @@ use arrayref::{array_mut_ref, array_ref};
use arrayvec::{ArrayString, ArrayVec};
use core::cmp;
use core::fmt;
-use join::{Join, SerialJoin};
use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
@@ -659,7 +658,7 @@ fn compress_parents_parallel(
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
-fn compress_subtree_wide<J: Join>(
+fn compress_subtree_wide<J: join::Join>(
input: &[u8],
key: &CVWords,
chunk_counter: u64,
@@ -733,7 +732,7 @@ fn compress_subtree_wide<J: Join>(
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
-fn compress_subtree_to_parent_node<J: Join>(
+fn compress_subtree_to_parent_node<J: join::Join>(
input: &[u8],
key: &CVWords,
chunk_counter: u64,
@@ -761,7 +760,7 @@ fn compress_subtree_to_parent_node<J: Join>(
// Hash a complete input all at once. Unlike compress_subtree_wide() and
// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
// Note that we use SerialJoin here, so this is always single-threaded.
-fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
+fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
let platform = Platform::detect();
// If the whole subtree is one chunk, hash it directly with a ChunkState.
@@ -775,7 +774,7 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
// compress_subtree_to_parent_node().
Output {
input_chaining_value: *key,
- block: compress_subtree_to_parent_node::<SerialJoin>(input, key, 0, flags, platform),
+ block: compress_subtree_to_parent_node::<J>(input, key, 0, flags, platform),
block_len: BLOCK_LEN as u8,
counter: 0,
flags: flags | PARENT,
@@ -792,9 +791,22 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
/// [`OutputReader`].
///
/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
+/// the [`hash_rayon`](hash_rayon) function (enabled by the `rayon` Cargo
+/// feature).
pub fn hash(input: &[u8]) -> Hash {
- hash_all_at_once(input, IV, 0).root_hash()
+ hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
+}
+
+/// Like [`hash`], but using Rayon-based multithreading as a performance
+/// optimization.
+///
+/// To get any performance benefit from multi-threading, the input needs to be
+/// very large. As a rule of thumb on x86_64, there is no benefit to
+/// multi-threading inputs less than 128 KiB. Other platforms have different
+/// thresholds, and in general you need to benchmark your specific use case.
+#[cfg(feature = "rayon")]
+pub fn hash_rayon(input: &[u8]) -> Hash {
+ hash_all_at_once::<join::RayonJoin>(input, IV, 0).root_hash()
}
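A sketch of how a caller might act on the rule of thumb above; the `hash_maybe_parallel` helper and the hard-coded cutoff are illustrative assumptions, not crate API.

```rust
// Illustrative only: the 128 KiB figure is the rule of thumb from the docs
// above, not a constant exported by the crate; benchmark your own workload.
#[cfg(feature = "rayon")]
fn hash_maybe_parallel(input: &[u8]) -> blake3::Hash {
    if input.len() >= 128 * 1024 {
        blake3::hash_rayon(input)
    } else {
        blake3::hash(input)
    }
}
```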
/// The keyed hash function.
@@ -809,10 +821,24 @@ pub fn hash(input: &[u8]) -> Hash {
/// [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
+/// the [`keyed_hash_rayon`](keyed_hash_rayon) function (enabled by the `rayon`
+/// Cargo feature).
pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
let key_words = platform::words_from_le_bytes_32(key);
- hash_all_at_once(input, &key_words, KEYED_HASH).root_hash()
+ hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
+}
+
+/// Like [`keyed_hash`], but using Rayon-based multithreading as a performance
+/// optimization.
+///
+/// To get any performance benefit from multi-threading, the input needs to be
+/// very large. As a rule of thumb on x86_64, there is no benefit to
+/// multi-threading inputs less than 128 KiB. Other platforms have different
+/// thresholds, and in general you need to benchmark your specific use case.
+#[cfg(feature = "rayon")]
+pub fn keyed_hash_rayon(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
+ let key_words = platform::words_from_le_bytes_32(key);
+ hash_all_at_once::<join::RayonJoin>(input, &key_words, KEYED_HASH).root_hash()
}
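A short usage sketch for the new keyed variant; the `verify_tag` helper is hypothetical, and the comparison relies on the crate's documented constant-time equality for `Hash`.

```rust
// Sketch of a MAC-style check using keyed_hash_rayon; `expected` would come
// from the sender.
#[cfg(feature = "rayon")]
fn verify_tag(key: &[u8; blake3::KEY_LEN], message: &[u8], expected: &blake3::Hash) -> bool {
    blake3::keyed_hash_rayon(key, message) == *expected
}
```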
/// The key derivation function.
@@ -845,13 +871,36 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
/// [`Hasher::finalize_xof`], and [`OutputReader`].
///
/// This function is always single-threaded. For multi-threading support, see
-/// [`Hasher::update_with_join`].
+/// the [`derive_key_rayon`](derive_key_rayon) function (enabled by the `rayon`
+/// Cargo feature).
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
- let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash();
+ let context_key =
+ hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+ .root_hash();
let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
- hash_all_at_once(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+ hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+ .root_hash()
+ .0
+}
+
+/// Like [`derive_key`], but using Rayon-based multithreading as a performance
+/// optimization.
+///
+/// To get any performance benefit from multi-threading, the input needs to be
+/// very large. As a rule of thumb on x86_64, there is no benefit to
+/// multi-threading inputs less than 128 KiB. Other platforms have different
+/// thresholds, and in general you need to benchmark your specific use case.
+#[cfg(feature = "rayon")]
+pub fn derive_key_rayon(context: &str, key_material: &[u8]) -> [u8; 32] {
+ // There is no conceivable reason anyone should use a context string long
+ // enough for multithreading to make a difference.
+ let context_key =
+ hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+ .root_hash();
+ let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
+ hash_all_at_once::<join::RayonJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
.root_hash()
.0
}
@@ -882,10 +931,10 @@ fn parent_node_output(
/// used traits from the [`digest`](https://crates.io/crates/digest) and
/// [`crypto_mac`](https://crates.io/crates/crypto-mac) crates.
///
-/// **Performance note:** The [`update`] and [`update_with_join`] methods
-/// perform poorly when the caller's input buffer is small. See their method
-/// docs below. A 16 KiB buffer is large enough to leverage all currently
-/// supported SIMD instruction sets.
+/// **Performance note:** The [`update`] and (if the `rayon` Cargo feature is
+/// enabled) [`update_rayon`] methods perform poorly when the caller's input
+/// buffer is small. See their method docs below. A 16 KiB buffer is large
+/// enough to leverage all currently supported SIMD instruction sets.
///
/// # Examples
///
@@ -952,7 +1001,9 @@ impl Hasher {
///
/// [`derive_key`]: fn.derive_key.html
pub fn new_derive_key(context: &str) -> Self {
- let context_key = hash_all_at_once(context.as_bytes(), IV, DERIVE_KEY_CONTEXT).root_hash();
+ let context_key =
+ hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+ .root_hash();
let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
}
@@ -1054,12 +1105,11 @@ impl Hasher {
///
/// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html
pub fn update(&mut self, input: &[u8]) -> &mut Self {
- self.update_with_join::<SerialJoin>(input)
+ self.update_with_join::<join::SerialJoin>(input)
}
- /// Add input bytes to the hash state, as with `update`, but potentially
- /// using multi-threading. See the example below, and the
- /// [`join`](join/index.html) module for a more detailed explanation.
+ /// Like [`update`](Hasher::update), but using Rayon-based multithreading as
+ /// a performance optimization.
///
/// To get any performance benefit from multi-threading, the input buffer
/// size needs to be very large. As a rule of thumb on x86_64, there is no
@@ -1087,11 +1137,16 @@ impl Hasher {
/// # fn some_large_input() -> &'static [u8] { b"foo" }
/// let input: &[u8] = some_large_input();
/// let mut hasher = blake3::Hasher::new();
- /// hasher.update_with_join::<blake3::join::RayonJoin>(input);
+ /// hasher.update_rayon(input);
/// let hash = hasher.finalize();
/// # }
/// ```
- pub fn update_with_join<J: Join>(&mut self, mut input: &[u8]) -> &mut Self {
+ #[cfg(feature = "rayon")]
+ pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
+ self.update_with_join::<join::RayonJoin>(input)
+ }
+
+ fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
// If we have some partial chunk bytes in the internal chunk_state, we
// need to finish that chunk first.
if self.chunk_state.len() > 0 {
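To close out the lib.rs changes, here is a sketch of feeding `update_rayon` from a buffered read; the `hash_reader_rayon` helper and the 1 MiB buffer size are illustrative assumptions.

```rust
use std::fs::File;
use std::io::Read;

// A sketch assuming the `rayon` feature: read in large chunks (1 MiB here,
// an arbitrary choice well above the 16 KiB SIMD guidance) and feed each
// chunk to update_rayon. Whether threading pays off depends on chunk size.
fn hash_reader_rayon(path: &str) -> std::io::Result<blake3::Hash> {
    let mut file = File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buf = vec![0u8; 1 << 20];
    loop {
        let n = file.read(&mut buf)?;
        if n == 0 {
            break;
        }
        hasher.update_rayon(&buf[..n]);
    }
    Ok(hasher.finalize())
}
```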
diff --git a/src/test.rs b/src/test.rs
index b99892c..1ebf27f 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -283,12 +283,26 @@ fn test_compare_reference_impl() {
// all at once
let test_out = crate::hash(input);
- assert_eq!(test_out, expected_out[..32]);
+ assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ // all at once (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let test_out = crate::hash_rayon(input);
+ assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ }
// incremental
let mut hasher = crate::Hasher::new();
hasher.update(input);
assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
assert_eq!(hasher.finalize(), test_out);
+ // incremental (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let mut hasher = crate::Hasher::new();
+ hasher.update_rayon(input);
+ assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), test_out);
+ }
// xof
let mut extended = [0; OUT];
hasher.finalize_xof().fill(&mut extended);
@@ -304,12 +318,26 @@ fn test_compare_reference_impl() {
// all at once
let test_out = crate::keyed_hash(&TEST_KEY, input);
- assert_eq!(test_out, expected_out[..32]);
+ assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ // all at once (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let test_out = crate::keyed_hash_rayon(&TEST_KEY, input);
+ assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ }
// incremental
let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
hasher.update(input);
assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
assert_eq!(hasher.finalize(), test_out);
+ // incremental (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+ hasher.update_rayon(input);
+ assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), test_out);
+ }
// xof
let mut extended = [0; OUT];
hasher.finalize_xof().fill(&mut extended);
@@ -326,12 +354,26 @@ fn test_compare_reference_impl() {
// all at once
let test_out = crate::derive_key(context, input);
- assert_eq!(test_out[..], expected_out[..32]);
+ assert_eq!(test_out, expected_out[..32]);
+ // all at once (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let test_out = crate::derive_key_rayon(context, input);
+ assert_eq!(test_out, expected_out[..32]);
+ }
// incremental
let mut hasher = crate::Hasher::new_derive_key(context);
hasher.update(input);
assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+ // incremental (rayon)
+ #[cfg(feature = "rayon")]
+ {
+ let mut hasher = crate::Hasher::new_derive_key(context);
+ hasher.update_rayon(input);
+ assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+ }
// xof
let mut extended = [0; OUT];
hasher.finalize_xof().fill(&mut extended);
@@ -504,17 +546,6 @@ fn test_reset() {
}
#[test]
-#[cfg(feature = "rayon")]
-fn test_update_with_rayon_join() {
- let mut input = [0; TEST_CASES_MAX];
- paint_test_input(&mut input);
- let rayon_hash = crate::Hasher::new()
- .update_with_join::<crate::join::RayonJoin>(&input)
- .finalize();
- assert_eq!(crate::hash(&input), rayon_hash);
-}
-
-#[test]
fn test_hex_encoding_decoding() {
let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9";
let mut hasher = crate::Hasher::new();