author     Jack O'Connor <[email protected]>    2023-09-10 15:18:11 -0700
committer  Jack O'Connor <[email protected]>    2023-09-16 17:04:27 -0700
commit     cb32f0bd1450991d86a8ceb3716fd591382cc507 (patch)
tree       f0112db7d1a4452facd92111eb7134a20270d60e
parent     e0bb91564125407102af81e219399025aa2c24b9 (diff)
replace the new file module with inherent methods on Hasher
New methods:
- update_reader
- update_mmap
- update_mmap_rayon

These are more discoverable, more convenient, and safer.

There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_mmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions.
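For orientation, here is a minimal sketch of how the three new inherent methods are called. The file path is a placeholder, and the `mmap` and `rayon` Cargo features are assumed to be enabled for the last two calls:

```rust
use std::fs::File;
use std::io;
use std::path::Path;

fn main() -> io::Result<()> {
    let path = Path::new("file.dat"); // placeholder path

    // Buffered reading from any std::io::Read; needs only the default `std` feature.
    let mut reader_hasher = blake3::Hasher::new();
    reader_hasher.update_reader(File::open(path)?)?;

    // Memory-mapped, single-threaded; needs the `mmap` feature.
    let mut mmap_hasher = blake3::Hasher::new();
    mmap_hasher.update_mmap(path)?;

    // Memory-mapped and multithreaded; needs both `mmap` and `rayon`.
    let mut rayon_hasher = blake3::Hasher::new();
    rayon_hasher.update_mmap_rayon(path)?;

    // All three produce the same hash.
    assert_eq!(reader_hasher.finalize(), mmap_hasher.finalize());
    assert_eq!(reader_hasher.finalize(), rayon_hasher.finalize());
    println!("{}", reader_hasher.finalize());
    Ok(())
}
```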
-rw-r--r--  .github/workflows/ci.yml |   4
-rw-r--r--  Cargo.toml               |  21
-rw-r--r--  b3sum/Cargo.lock         |  24
-rw-r--r--  b3sum/Cargo.toml         |   2
-rw-r--r--  b3sum/src/main.rs        | 110
-rw-r--r--  src/file.rs              |  67
-rw-r--r--  src/io.rs                |  79
-rw-r--r--  src/lib.rs               | 282
-rw-r--r--  src/test.rs              | 121
9 files changed, 466 insertions, 244 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f179ada..16a5468 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,6 +59,10 @@ jobs:
run: cargo test --features=rayon,traits-preview,zeroize
env:
RAYON_NUM_THREADS: 1
+ # The mmap feature by itself (update_mmap_rayon is omitted).
+ - run: cargo test --features=mmap
+ # All public features put together.
+ - run: cargo test --features=mmap,rayon,traits-preview,zeroize
# no_std tests.
- run: cargo test --no-default-features
diff --git a/Cargo.toml b/Cargo.toml
index 74aed30..d92fc6e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,11 +25,17 @@ neon = []
# entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
std = []
-# The "rayon" feature (defined below as an optional dependency) enables the
-# `Hasher::update_rayon` method, for multithreaded hashing. However, even if
-# this feature is enabled, all other APIs remain single-threaded.
+# The `rayon` feature (disabled by default, but enabled for docs.rs) adds the
+# `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon`
+# methods, for multithreaded hashing. However, even if this feature is enabled,
+# all other APIs remain single-threaded.
rayon = ["dep:rayon", "std"]
+# The `mmap` feature (disabled by default, but enabled for docs.rs) adds the
+# `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon`
+# helper methods for memory-mapped IO.
+mmap = ["std", "dep:memmap2"]
+
# Implement the zeroize::Zeroize trait for types in this crate.
zeroize = ["dep:zeroize", "arrayvec/zeroize"]
@@ -81,11 +87,9 @@ no_avx2 = []
no_avx512 = []
no_neon = []
-file = ["memmap2", "rayon", "std"]
-
[package.metadata.docs.rs]
-# Document Hasher::update_rayon on docs.rs.
-features = ["rayon", "zeroize"]
+# Document the rayon/mmap methods and the Zeroize impls on docs.rs.
+features = ["mmap", "rayon", "zeroize"]
[dependencies]
arrayref = "0.3.5"
@@ -98,12 +102,13 @@ zeroize = { version = "1", default-features = false, features = ["zeroize_derive
memmap2 = { version = "0.7.1", optional = true }
[dev-dependencies]
+hmac = "0.12.0"
hex = "0.4.2"
page_size = "0.6.0"
rand = "0.8.0"
rand_chacha = "0.3.0"
reference_impl = { path = "./reference_impl" }
-hmac = "0.12.0"
+tempfile = "3.8.0"
[build-dependencies]
cc = "1.0.4"
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index 3c7c737..d76d175 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -18,9 +18,9 @@ dependencies = [
[[package]]
name = "anstyle"
-version = "1.0.2"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea"
+checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
[[package]]
name = "anstyle-parse"
@@ -131,9 +131,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
-version = "4.4.2"
+version = "4.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6"
+checksum = "84ed82781cea27b43c9b106a979fe450a13a31aab0500595fb3fc06616de08e6"
dependencies = [
"clap_builder",
"clap_derive",
@@ -307,9 +307,9 @@ dependencies = [
[[package]]
name = "libc"
-version = "0.2.147"
+version = "0.2.148"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b"
[[package]]
name = "linux-raw-sys"
@@ -369,9 +369,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328"
dependencies = [
"unicode-ident",
]
@@ -467,9 +467,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "syn"
-version = "2.0.32"
+version = "2.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
+checksum = "91e02e55d62894af2a08aca894c6577281f76769ba47c94d5756bec8ac6e7373"
dependencies = [
"proc-macro2",
"quote",
@@ -501,9 +501,9 @@ dependencies = [
[[package]]
name = "unicode-ident"
-version = "1.0.11"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "utf8parse"
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index 19b617e..dcd9f96 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -15,7 +15,7 @@ pure = ["blake3/pure"]
[dependencies]
anyhow = "1.0.25"
-blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
+blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index 165c579..228737f 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -163,73 +163,22 @@ impl Args {
}
}
-enum Input {
- Mmap(io::Cursor<memmap2::Mmap>),
- File(File),
- Stdin,
-}
-
-impl Input {
- // Open an input file, using mmap if appropriate. "-" means stdin. Note
- // that this convention applies both to command line arguments, and to
- // filepaths that appear in a checkfile.
- fn open(path: &Path, args: &Args) -> Result<Self> {
- if path == Path::new("-") {
- if args.keyed() {
- bail!("Cannot open `-` in keyed mode");
- }
- return Ok(Self::Stdin);
- }
- let file = File::open(path)?;
- if !args.no_mmap() {
- if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
- return Ok(Self::Mmap(io::Cursor::new(mmap)));
- }
- }
- Ok(Self::File(file))
- }
-
- fn hash(&mut self, args: &Args) -> Result<blake3::OutputReader> {
- let mut hasher = args.base_hasher.clone();
- match self {
- // The fast path: If we mmapped the file successfully, hash using
- // multiple threads. This doesn't work on stdin, or on some files,
- // and it can also be disabled with --no-mmap.
- Self::Mmap(cursor) => {
- hasher.update_rayon(cursor.get_ref());
- }
- // The slower paths, for stdin or files we didn't/couldn't mmap.
- // This is currently all single-threaded. Doing multi-threaded
- // hashing without memory mapping is tricky, since all your worker
- // threads have to stop every time you refill the buffer, and that
- // ends up being a lot of overhead. To solve that, we need a more
- // complicated double-buffering strategy where a background thread
- // fills one buffer while the worker threads are hashing the other
- // one. We might implement that in the future, but since this is
- // the slow path anyway, it's not high priority.
- Self::File(file) => {
- blake3::copy_wide(file, &mut hasher)?;
- }
- Self::Stdin => {
- let stdin = io::stdin();
- let lock = stdin.lock();
- blake3::copy_wide(lock, &mut hasher)?;
- }
- }
- let mut output_reader = hasher.finalize_xof();
- output_reader.set_position(args.seek());
- Ok(output_reader)
- }
-}
-
-impl Read for Input {
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
- match self {
- Self::Mmap(cursor) => cursor.read(buf),
- Self::File(file) => file.read(buf),
- Self::Stdin => io::stdin().read(buf),
+fn hash_path(args: &Args, path: &Path) -> Result<blake3::OutputReader> {
+ let mut hasher = args.base_hasher.clone();
+ if path == Path::new("-") {
+ if args.keyed() {
+ bail!("Cannot open `-` in keyed mode");
}
+ hasher.update_reader(io::stdin().lock())?;
+ } else if args.no_mmap() {
+ hasher.update_reader(File::open(path)?)?;
+ } else {
+ // The fast path: Try to mmap the file and hash it with multiple threads.
+ hasher.update_mmap_rayon(path)?;
}
+ let mut output_reader = hasher.finalize_xof();
+ output_reader.set_position(args.seek());
+ Ok(output_reader)
}
fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
@@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result<ParsedCheckLine> {
}
fn hash_one_input(path: &Path, args: &Args) -> Result<()> {
- let mut input = Input::open(path, args)?;
- let output = input.hash(args)?;
+ let output = hash_path(args, path)?;
if args.raw() {
write_raw_output(output, args)?;
return Ok(());
@@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool {
} else {
file_string
};
- let hash_result: Result<blake3::Hash> = Input::open(&file_path, args)
- .and_then(|mut input| input.hash(args))
- .map(|mut hash_output| {
+ let found_hash: blake3::Hash;
+ match hash_path(args, &file_path) {
+ Ok(mut output) => {
let mut found_hash_bytes = [0; blake3::OUT_LEN];
- hash_output.fill(&mut found_hash_bytes);
- found_hash_bytes.into()
- });
- let found_hash: blake3::Hash = match hash_result {
- Ok(hash) => hash,
+ output.fill(&mut found_hash_bytes);
+ found_hash = found_hash_bytes.into();
+ }
Err(e) => {
println!("{}: FAILED ({})", file_string, e);
return false;
@@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool {
}
fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> {
- let checkfile_input = Input::open(path, args)?;
- let mut bufreader = io::BufReader::new(checkfile_input);
+ let mut file;
+ let stdin;
+ let mut stdin_lock;
+ let mut bufreader: io::BufReader<&mut dyn Read>;
+ if path == Path::new("-") {
+ stdin = io::stdin();
+ stdin_lock = stdin.lock();
+ bufreader = io::BufReader::new(&mut stdin_lock);
+ } else {
+ file = File::open(path)?;
+ bufreader = io::BufReader::new(&mut file);
+ }
let mut line = String::new();
loop {
line.clear();
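The `hash_path` helper above returns an `OutputReader` rather than a finished hash, so callers can seek into and read extended output. A condensed sketch of that same flow outside of b3sum (stdin input here is just for illustration; b3sum passes `args.seek()` instead of 0):

```rust
use std::io;

fn main() -> io::Result<()> {
    let mut hasher = blake3::Hasher::new();
    hasher.update_reader(io::stdin().lock())?;

    // finalize_xof() gives a seekable reader over the extended output.
    let mut output_reader = hasher.finalize_xof();
    output_reader.set_position(0);

    // Read the first OUT_LEN bytes and convert them into a Hash, as
    // check_one_line does above.
    let mut found_hash_bytes = [0; blake3::OUT_LEN];
    output_reader.fill(&mut found_hash_bytes);
    let found_hash: blake3::Hash = found_hash_bytes.into();
    println!("{}", found_hash);
    Ok(())
}
```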
diff --git a/src/file.rs b/src/file.rs
deleted file mode 100644
index 81ccbbe..0000000
--- a/src/file.rs
+++ /dev/null
@@ -1,67 +0,0 @@
-//! The file-related utilities.
-//!
-//! # Examples
-//!
-//! ```no_run
-//! use std::io;
-//!
-//! use blake3::file::hash_path_maybe_mmap;
-//!
-//! fn main() -> io::Result<()> {
-//! let args: Vec<_> = std::env::args_os().collect();
-//! assert_eq!(args.len(), 2);
-//! let path = &args[1];
-//! let mut hasher = blake3::Hasher::new();
-//! hash_path_maybe_mmap(&mut hasher, path)?;
-//! println!("{}", hasher.finalize());
-//! Ok(())
-//! }
-//! ```
-
-use std::{fs::File, io, path::Path};
-
-/// Mmap a file, if it looks like a good idea. Return None in cases where we
-/// know mmap will fail, or if the file is short enough that mmapping isn't
-/// worth it. However, if we do try to mmap and it fails, return the error.
-pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
- let metadata = file.metadata()?;
- let file_size = metadata.len();
- #[allow(clippy::if_same_then_else)]
- if !metadata.is_file() {
- // Not a real file.
- Ok(None)
- } else if file_size > isize::max_value() as u64 {
- // Too long to safely map.
- // https://github.com/danburkert/memmap-rs/issues/69
- Ok(None)
- } else if file_size == 0 {
- // Mapping an empty file currently fails.
- // https://github.com/danburkert/memmap-rs/issues/72
- Ok(None)
- } else if file_size < 16 * 1024 {
- // Mapping small files is not worth it.
- Ok(None)
- } else {
- // Explicitly set the length of the memory map, so that filesystem
- // changes can't race to violate the invariants we just checked.
- let map = unsafe {
- memmap2::MmapOptions::new()
- .len(file_size as usize)
- .map(file)?
- };
- Ok(Some(map))
- }
-}
-
-/// Hash a file fast.
-///
-/// It may use mmap if the file is big enough. If not, it will read the whole file into a buffer.
-pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
- let file = File::open(path.as_ref())?;
- if let Some(mmap) = maybe_memmap_file(&file)? {
- hasher.update_rayon(&mmap);
- } else {
- crate::copy_wide(&file, hasher)?;
- }
- Ok(())
-}
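For callers who were using the removed `blake3::file::hash_path_maybe_mmap` (which mapped the file and hashed it with `update_rayon`), the closest replacement is `Hasher::update_mmap_rayon`, gated by the `mmap` and `rayon` features. A sketch of the old doc example migrated to the new method:

```rust
use std::io;

fn main() -> io::Result<()> {
    let args: Vec<_> = std::env::args_os().collect();
    assert_eq!(args.len(), 2);
    let path = &args[1];
    let mut hasher = blake3::Hasher::new();
    // Previously: blake3::file::hash_path_maybe_mmap(&mut hasher, path)?;
    hasher.update_mmap_rayon(path)?;
    println!("{}", hasher.finalize());
    Ok(())
}
```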
diff --git a/src/io.rs b/src/io.rs
new file mode 100644
index 0000000..1c19881
--- /dev/null
+++ b/src/io.rs
@@ -0,0 +1,79 @@
+//! Helper functions for efficient IO.
+
+#[cfg(feature = "std")]
+pub(crate) fn copy_wide(
+ mut reader: impl std::io::Read,
+ hasher: &mut crate::Hasher,
+) -> std::io::Result<u64> {
+ let mut buffer = [0; 65536];
+ let mut total = 0;
+ loop {
+ match reader.read(&mut buffer) {
+ Ok(0) => return Ok(total),
+ Ok(n) => {
+ hasher.update(&buffer[..n]);
+ total += n as u64;
+ }
+ // see test_update_reader_interrupted
+ Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+ Err(e) => return Err(e),
+ }
+ }
+}
+
+// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
+// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
+// fails, return the error.
+//
+// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
+// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
+// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
+// this function is crate-private, we can guarantee that all that can ever happen in the event of a race
+// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
+// which should risk memory corruption in a safe caller.
+//
+// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
+// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
+// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
+// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
+// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
+// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
+// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
+// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
+// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
+// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
+// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
+// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
+// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
+// different here. Feedback needed.
+#[cfg(feature = "mmap")]
+pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
+ let metadata = file.metadata()?;
+ let file_size = metadata.len();
+ #[allow(clippy::if_same_then_else)]
+ if !metadata.is_file() {
+ // Not a real file.
+ Ok(None)
+ } else if file_size > isize::max_value() as u64 {
+ // Too long to safely map.
+ // https://github.com/danburkert/memmap-rs/issues/69
+ Ok(None)
+ } else if file_size == 0 {
+ // Mapping an empty file currently fails.
+ // https://github.com/danburkert/memmap-rs/issues/72
+ // See test_mmap_virtual_file.
+ Ok(None)
+ } else if file_size < 16 * 1024 {
+ // Mapping small files is not worth it.
+ Ok(None)
+ } else {
+ // Explicitly set the length of the memory map, so that filesystem
+ // changes can't race to violate the invariants we just checked.
+ let map = unsafe {
+ memmap2::MmapOptions::new()
+ .len(file_size as usize)
+ .map(file)?
+ };
+ Ok(Some(map))
+ }
+}
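Because `maybe_mmap_file` is crate-private, the heuristic above is only observable through the public `update_mmap`/`update_mmap_rayon` methods: files below the 16 KiB threshold (and empty or non-regular files) silently fall back to `copy_wide`, and the resulting hash is the same either way. A quick sanity sketch of that behavior, assuming the `mmap` feature and the `tempfile` dev-dependency (it mirrors the tests added later in this commit):

```rust
use std::io::Write;

fn main() -> std::io::Result<()> {
    // Sizes on both sides of the 16 KiB mmap threshold.
    for size in [1usize, 16 * 1024, 1_000_000] {
        let input = vec![0xab; size];
        let mut tempfile = tempfile::NamedTempFile::new()?;
        tempfile.write_all(&input)?;
        tempfile.flush()?;

        // Whether or not the file actually gets mapped, the hash matches update().
        let mut hasher = blake3::Hasher::new();
        hasher.update_mmap(tempfile.path())?;
        assert_eq!(hasher.finalize(), blake3::hash(&input));
    }
    Ok(())
}
```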
diff --git a/src/lib.rs b/src/lib.rs
index b262380..1a2d22d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,18 +33,28 @@
//! # Cargo Features
//!
//! The `std` feature (the only feature enabled by default) is required for
-//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime
-//! CPU feature detection on x86. If this feature is disabled, the only way to
-//! use the x86 SIMD implementations is to enable the corresponding instruction
-//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting
-//! binary will not be portable to other machines.
+//! implementations of the [`Write`] and [`Seek`] traits, the
+//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
+//! feature detection on x86. If this feature is disabled, the only way to use
+//! the x86 SIMD implementations is to enable the corresponding instruction sets
+//! globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting binary
+//! will not be portable to other machines.
//!
//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds
-//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However,
-//! even if this feature is enabled, all other APIs remain single-threaded.
+//! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap`
+//! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for
+//! multithreaded hashing. However, even if this feature is enabled, all other
+//! APIs remain single-threaded.
//!
-//! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) implements
-//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for this crate's types.
+//! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the
+//! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above)
+//! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for
+//! memory-mapped IO.
+//!
+//! The `zeroize` feature (disabled by default, but enabled for [docs.rs])
+//! implements
+//! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for
+//! this crate's types.
//!
//! The NEON implementation is enabled by default for AArch64 but requires the
//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and
@@ -52,12 +62,12 @@
//! without NEON support.
//!
//! The `traits-preview` feature enables implementations of traits from the
-//! RustCrypto [`digest`] crate, and re-exports that crate as
-//! `traits::digest`. However, the traits aren't stable, and they're expected to
-//! change in incompatible ways before that crate reaches 1.0. For that reason,
-//! this crate makes no SemVer guarantees for this feature, and callers who use
-//! it should expect breaking changes between patch versions. (The "-preview"
-//! feature name follows the conventions of the RustCrypto [`signature`] crate.)
+//! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
+//! However, the traits aren't stable, and they're expected to change in
+//! incompatible ways before that crate reaches 1.0. For that reason, this crate
+//! makes no SemVer guarantees for this feature, and callers who use it should
+//! expect breaking changes between patch versions. (The "-preview" feature name
+//! follows the conventions of the RustCrypto [`signature`] crate.)
//!
//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon
//! [BLAKE3]: https://blake3.io
@@ -115,9 +125,7 @@ mod sse41;
#[cfg(feature = "traits-preview")]
pub mod traits;
-#[cfg(feature = "file")]
-pub mod file;
-
+mod io;
mod join;
use arrayref::{array_mut_ref, array_ref};
@@ -914,6 +922,15 @@ fn parent_node_output(
/// An incremental hash state that can accept any number of writes.
///
+/// **Performance note:** The [`update`](#method.update) method can't take full
+/// advantage of SIMD optimizations if its input buffer is too small or oddly
+/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
+/// supported SIMD instruction sets. See also
+/// [`update_reader`](Hasher::update_reader).
+///
+/// The `rayon` and `mmap` Cargo features enable additional methods on this
+/// type related to multithreading and memory-mapped IO.
+///
/// When the `traits-preview` Cargo feature is enabled, this type implements
/// several commonly used traits from the
/// [`digest`](https://crates.io/crates/digest) crate. However, those
@@ -922,15 +939,6 @@ fn parent_node_output(
/// guarantees for this feature, and callers who use it should expect breaking
/// changes between patch versions.
///
-/// When the `rayon` Cargo feature is enabled, the
-/// [`update_rayon`](#method.update_rayon) method is available for multithreaded
-/// hashing.
-///
-/// **Performance note:** The [`update`](#method.update) method can't take full
-/// advantage of SIMD optimizations if its input buffer is too small or oddly
-/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
-/// supported SIMD instruction sets.
-///
/// # Examples
///
/// ```
@@ -1099,30 +1107,6 @@ impl Hasher {
self.update_with_join::<join::SerialJoin>(input)
}
- /// Identical to [`update`](Hasher::update), but using Rayon-based
- /// multithreading internally.
- ///
- /// This method is gated by the `rayon` Cargo feature, which is disabled by
- /// default but enabled on [docs.rs](https://docs.rs).
- ///
- /// To get any performance benefit from multithreading, the input buffer
- /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
- /// _slower_ than `update` for inputs under 128 KiB. That threshold varies
- /// quite a lot across different processors, and it's important to benchmark
- /// your specific use case.
- ///
- /// Memory mapping an entire input file is a simple way to take advantage of
- /// multithreading without needing to carefully tune your buffer size or
- /// offload IO. However, on spinning disks where random access is expensive,
- /// that approach can lead to disk thrashing and terrible IO performance.
- /// Note that OS page caching can mask this problem, in which case it might
- /// only appear for files larger than available RAM. Again, benchmarking
- /// your specific use case is important.
- #[cfg(feature = "rayon")]
- pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
- self.update_with_join::<join::RayonJoin>(input)
- }
-
fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
// If we have some partial chunk bytes in the internal chunk_state, we
// need to finish that chunk first.
@@ -1321,6 +1305,179 @@ impl Hasher {
pub fn count(&self) -> u64 {
self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64
}
+
+ /// As [`update`](Hasher::update), but reading from a
+ /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
+ ///
+ /// [`Hasher`] implements
+ /// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
+ /// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
+ /// from any reader. Unfortunately, this standard approach can limit performance, because the
+ /// [`copy`](https://doc.rust-lang.org/std/io/fn.copy.html) function currently uses an internal
+ /// 8 KiB buffer that isn't big enough to take advantage of all SIMD instruction sets. (In
+ /// particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512) needs a
+ /// 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly more
+ /// convenient.
+ ///
+ /// The internal buffer size this method uses may change at any time, and it may be different
+ /// for different targets. The only guarantee is that it will be large enough for all of this
+ /// crate's SIMD implementations on the current platform.
+ ///
+ /// The most common implementer of
+ /// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
+ /// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
+ /// mapping can be faster than this method for hashing large files. See
+ /// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
+ /// which require the `mmap` and (for the latter) `rayon` Cargo features.
+ ///
+ /// This method requires the `std` Cargo feature, which is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// ```no_run
+ /// # use std::fs::File;
+ /// # use std::io;
+ /// # fn main() -> io::Result<()> {
+ /// // Hash standard input.
+ /// let mut hasher = blake3::Hasher::new();
+ /// hasher.update_reader(std::io::stdin().lock())?;
+ /// println!("{}", hasher.finalize());
+ /// # Ok(())
+ /// # }
+ /// ```
+ #[cfg(feature = "std")]
+ pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<u64> {
+ io::copy_wide(reader, self)
+ }
+
+ /// As [`update`](Hasher::update), but using Rayon-based multithreading
+ /// internally.
+ ///
+ /// This method is gated by the `rayon` Cargo feature, which is disabled by
+ /// default but enabled on [docs.rs](https://docs.rs).
+ ///
+ /// To get any performance benefit from multithreading, the input buffer
+ /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
+ /// _slower_ than `update` for inputs under 128 KiB. That threshold varies
+ /// quite a lot across different processors, and it's important to benchmark
+ /// your specific use case. See also the performance warning associated with
+ /// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
+ ///
+ /// If you already have a large buffer in memory, and you want to hash it
+ /// with multiple threads, this method is a good option. However, reading a
+ /// file into memory just to call this method can be a performance mistake,
+ /// both because it requires lots of memory and because single-threaded
+ /// reads can be slow. For hashing whole files, see
+ /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both
+ /// the `rayon` and `mmap` Cargo features.
+ #[cfg(feature = "rayon")]
+ pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
+ self.update_with_join::<join::RayonJoin>(input)
+ }
+
+ /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping.
+ ///
+ /// Not all files can be memory mapped, and memory mapping small files can be slower than
+ /// reading them the usual way. In those cases, this method will fall back to standard file IO.
+ /// The heuristic for whether to use memory mapping is currently very simple (file size >=
+ /// 16 KiB), and it might change at any time.
+ ///
+ /// Like [`update`](Hasher::update), this method is single-threaded. In this author's
+ /// experience, memory mapping improves single-threaded performance by ~10% for large files
+ /// that are already in cache. This probably varies between platforms, and as always it's a
+ /// good idea to benchmark your own use case. In comparison, the multithreaded
+ /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on
+ /// performance.
+ ///
+ /// There's a correctness reason that this method takes
+ /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of
+ /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped
+ /// file ignores the seek position of the original file handle (it neither respects the current
+ /// position nor updates the position). This difference in behavior would've caused
+ /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and
+ /// have different side effects in some cases. Taking a
+ /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by
+ /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is
+ /// opened internally.
+ ///
+ /// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on
+ /// [docs.rs](https://docs.rs).
+ ///
+ /// # Example
+ ///
+ /// ```no_run
+ /// # use std::io;
+ /// # use std::path::Path;
+ /// # fn main() -> io::Result<()> {
+ /// let path = Path::new("file.dat");
+ /// let mut hasher = blake3::Hasher::new();
+ /// hasher.update_mmap(path)?;
+ /// println!("{}", hasher.finalize());
+ /// # Ok(())
+ /// # }
+ /// ```
+ #[cfg(feature = "mmap")]
+ pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<()> {
+ let file = std::fs::File::open(path.as_ref())?;
+ if let Some(mmap) = io::maybe_mmap_file(&file)? {
+ self.update(&mmap);
+ } else {
+ io::copy_wide(&file, self)?;
+ }
+ Ok(())
+ }
+
+ /// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using
+ /// memory mapping. This is the default behavior of `b3sum`.
+ ///
+ /// For large files that are likely to be in cache, this can be much faster than
+ /// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other
+ /// cryptographic hashes, this is usually what they're measuring. However...
+ ///
+ /// **Performance Warning:** There are cases where multithreading hurts performance. The worst
+ /// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31),
+ /// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends
+ /// more time seeking around than reading data). Windows tends to be somewhat worse about this,
+ /// in part because it's less likely than Linux to keep very large files in cache. More
+ /// generally, if your CPU cores are already busy, then multithreading will add overhead
+ /// without improving performance. If your code runs in different environments that you don't
+ /// control and can't measure, then unfortunately there's no one-size-fits-all answer for
+ /// whether multithreading is a good idea.
+ ///
+ /// The memory mapping behavior of this function is the same as
+ /// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
+ /// file IO might change at any time.
+ ///
+ /// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
+ /// default but enabled on [docs.rs](https://docs.rs).
+ ///
+ /// # Example
+ ///
+ /// ```no_run
+ /// # use std::io;
+ /// # use std::path::Path;
+ /// # fn main() -> io::Result<()> {
+ /// # #[cfg(feature = "rayon")]
+ /// # {
+ /// let path = Path::new("big_file.dat");
+ /// let mut hasher = blake3::Hasher::new();
+ /// hasher.update_mmap_rayon(path)?;
+ /// println!("{}", hasher.finalize());
+ /// # }
+ /// # Ok(())
+ /// # }
+ /// ```
+ #[cfg(feature = "mmap")]
+ #[cfg(feature = "rayon")]
+ pub fn update_mmap_rayon(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<()> {
+ let file = std::fs::File::open(path.as_ref())?;
+ if let Some(mmap) = io::maybe_mmap_file(&file)? {
+ self.update_rayon(&mmap);
+ } else {
+ io::copy_wide(&file, self)?;
+ }
+ Ok(())
+ }
}
// Don't derive(Debug), because the state may be secret.
@@ -1355,29 +1512,6 @@ impl std::io::Write for Hasher {
}
}
-/// Copy from `reader` to `hasher`, returning the number of bytes read.
-///
-/// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
-/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
-/// can support at least 64 KiB, and there's some performance benefit to using
-/// bigger reads, so that's what we use here.
-#[cfg(feature = "std")]
-pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
- let mut buffer = [0; 65536];
- let mut total = 0;
- loop {
- match reader.read(&mut buffer) {
- Ok(0) => return Ok(total),
- Ok(n) => {
- hasher.update(&buffer[..n]);
- total += n as u64;
- }
- Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
- Err(e) => return Err(e),
- }
- }
-}
-
/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///
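Note that `copy_wide` is no longer part of the public API; it moved into the private `io` module above. External callers who relied on it can switch to `update_reader`, which wraps the same internal wide-buffered read loop and additionally returns the byte count. A sketch of that migration (the file path is a placeholder):

```rust
use std::fs::File;
use std::io;
use std::path::Path;

fn hash_file(path: &Path) -> io::Result<blake3::Hash> {
    let mut hasher = blake3::Hasher::new();
    // Previously: blake3::copy_wide(File::open(path)?, &mut hasher)?;
    let n = hasher.update_reader(File::open(path)?)?;
    eprintln!("hashed {} bytes", n);
    Ok(hasher.finalize())
}

fn main() -> io::Result<()> {
    println!("{}", hash_file(Path::new("file.dat"))?); // placeholder path
    Ok(())
}
```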
diff --git a/src/test.rs b/src/test.rs
index 0d94f44..a319930 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -658,11 +658,13 @@ fn test_zeroize() {
assert_eq!(hasher.chunk_state.buf_len, 0);
assert_eq!(hasher.chunk_state.blocks_compressed, 0);
assert_eq!(hasher.chunk_state.flags, 0);
- assert!(matches!(hasher.chunk_state.platform, crate::Platform::Portable));
+ assert!(matches!(
+ hasher.chunk_state.platform,
+ crate::Platform::Portable
+ ));
assert_eq!(hasher.key, [0; 8]);
assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);
-
let mut output_reader = crate::OutputReader {
inner: crate::Output {
input_chaining_value: [42; 8],
@@ -675,14 +677,123 @@ fn test_zeroize() {
position_within_block: 42,
};
-
output_reader.zeroize();
assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
assert_eq!(output_reader.inner.block, [0; 64]);
assert_eq!(output_reader.inner.counter, 0);
assert_eq!(output_reader.inner.block_len, 0);
assert_eq!(output_reader.inner.flags, 0);
- assert!(matches!(output_reader.inner.platform, crate::Platform::Portable));
+ assert!(matches!(
+ output_reader.inner.platform,
+ crate::Platform::Portable
+ ));
assert_eq!(output_reader.position_within_block, 0);
-}
\ No newline at end of file
+}
+
+#[test]
+#[cfg(feature = "std")]
+fn test_update_reader() -> Result<(), std::io::Error> {
+ // This is a brief test, since update_reader() is mostly a wrapper around update(), which already
+ // has substantial testing.
+ use std::io::prelude::*;
+ let mut input = vec![0; 1_000_000];
+ paint_test_input(&mut input);
+ let mut tempfile = tempfile::NamedTempFile::new()?;
+ tempfile.write_all(&input)?;
+ tempfile.flush()?;
+ let mut hasher = crate::Hasher::new();
+ hasher.update_reader(std::fs::File::open(tempfile.path())?)?;
+ assert_eq!(hasher.finalize(), crate::hash(&input));
+ Ok(())
+}
+}
+
+#[test]
+#[cfg(feature = "std")]
+fn test_update_reader_interrupted() -> std::io::Result<()> {
+ use std::io;
+ struct InterruptingReader<'a> {
+ already_interrupted: bool,
+ slice: &'a [u8],
+ }
+ impl<'a> InterruptingReader<'a> {
+ fn new(slice: &'a [u8]) -> Self {
+ Self {
+ already_interrupted: false,
+ slice,
+ }
+ }
+ }
+ impl<'a> io::Read for InterruptingReader<'a> {
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+ if !self.already_interrupted {
+ self.already_interrupted = true;
+ return Err(io::Error::from(io::ErrorKind::Interrupted));
+ }
+ let take = std::cmp::min(self.slice.len(), buf.len());
+ buf[..take].copy_from_slice(&self.slice[..take]);
+ self.slice = &self.slice[take..];
+ Ok(take)
+ }
+ }
+
+ let input = b"hello world";
+ let mut reader = InterruptingReader::new(input);
+ let mut hasher = crate::Hasher::new();
+ hasher.update_reader(&mut reader)?;
+ assert_eq!(hasher.finalize(), crate::hash(input));
+ Ok(())
+}
+
+#[test]
+#[cfg(feature = "mmap")]
+fn test_mmap() -> Result<(), std::io::Error> {
+ // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already
+ // has substantial testing.
+ use std::io::prelude::*;
+ let mut input = vec![0; 1_000_000];
+ paint_test_input(&mut input);
+ let mut tempfile = tempfile::NamedTempFile::new()?;
+ tempfile.write_all(&input)?;
+ tempfile.flush()?;
+ let mut hasher = crate::Hasher::new();
+ hasher.update_mmap(tempfile.path())?;
+ assert_eq!(hasher.finalize(), crate::hash(&input));
+ Ok(())
+}
+
+#[test]
+#[cfg(feature = "mmap")]
+#[cfg(target_os = "linux")]
+fn test_mmap_virtual_file() -> Result<(), std::io::Error> {
+ // Virtual files like /proc/version can't be mmapped, because their contents don't actually
+ // exist anywhere in memory. Make sure we fall back to regular file IO in these cases.
+ // Currently this is handled with a length check, where the assumption is that virtual files
+ // will always report length 0. If that assumption ever breaks, hopefully this test will catch
+ // it.
+ let virtual_filepath = "/proc/version";
+ let mut mmap_hasher = crate::Hasher::new();
+ // We'll fail right here if the fallback doesn't work.
+ mmap_hasher.update_mmap(virtual_filepath)?;
+ let mut read_hasher = crate::Hasher::new();
+ read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?;
+ assert_eq!(mmap_hasher.finalize(), read_hasher.finalize());
+ Ok(())
+}
+
+#[test]
+#[cfg(feature = "mmap")]
+#[cfg(feature = "rayon")]
+fn test_mmap_rayon() -> Result<(), std::io::Error> {
+ // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(),
+ // which already has substantial testing.
+ use std::io::prelude::*;
+ let mut input = vec![0; 1_000_000];
+ paint_test_input(&mut input);
+ let mut tempfile = tempfile::NamedTempFile::new()?;
+ tempfile.write_all(&input)?;
+ tempfile.flush()?;
+ let mut hasher = crate::Hasher::new();
+ hasher.update_mmap_rayon(tempfile.path())?;
+ assert_eq!(hasher.finalize(), crate::hash(&input));
+ Ok(())
+}