move file operations from b3sum to blake3

author: Banyc <[email protected]> 2023-07-18 00:48:52 +0800
committer: Jack O'Connor <[email protected]> 2023-09-16 14:20:39 -0700
commit: e0bb91564125407102af81e219399025aa2c24b9 (patch)
tree: f17b2553af54a7bb800753763b0e1f12b0090f84
parent: 12b368541f917d69b6169a4d895e206144acd44f (diff)
6 files changed, 101 insertions, 56 deletions
diff --git a/Cargo.toml b/Cargo.toml
index e9ab95e..74aed30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,6 +81,8 @@ no_avx2 = []
 no_avx512 = []
 no_neon = []
 
+file = ["memmap2", "rayon", "std"]
+
 [package.metadata.docs.rs]
 # Document Hasher::update_rayon on docs.rs.
 features = ["rayon", "zeroize"]
@@ -93,6 +95,7 @@ rayon = { version = "1.2.1", optional = true }
 cfg-if = "1.0.0"
 digest = { version = "0.10.1", features = [ "mac" ], optional = true }
 zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }
+memmap2 = { version = "0.7.1", optional = true }
 
 [dev-dependencies]
 hex = "0.4.2"
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index d1049af..3c7c737 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -110,6 +110,7 @@ dependencies = [
  "cc",
  "cfg-if",
  "constant_time_eq",
+ "memmap2",
  "rayon",
 ]
 
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index 02c9405..19b617e 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -15,7 +15,7 @@ pure = ["blake3/pure"]
 
 [dependencies]
 anyhow = "1.0.25"
-blake3 = { version = "1", path = "..", features = ["rayon"] }
+blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
 clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
 hex = "0.4.0"
 memmap2 = "0.7.0"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index fd35f68..165c579 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -182,7 +182,7 @@ impl Input {
         }
         let file = File::open(path)?;
         if !args.no_mmap() {
-            if let Some(mmap) = maybe_memmap_file(&file)? {
+            if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
                 return Ok(Self::Mmap(io::Cursor::new(mmap)));
             }
         }
@@ -208,12 +208,12 @@ impl Input {
             // one. We might implement that in the future, but since this is
             // the slow path anyway, it's not high priority.
             Self::File(file) => {
-                copy_wide(file, &mut hasher)?;
+                blake3::copy_wide(file, &mut hasher)?;
             }
             Self::Stdin => {
                 let stdin = io::stdin();
                 let lock = stdin.lock();
-                copy_wide(lock, &mut hasher)?;
+                blake3::copy_wide(lock, &mut hasher)?;
             }
         }
         let mut output_reader = hasher.finalize_xof();
@@ -232,58 +232,6 @@ impl Read for Input {
     }
 }
 
-// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
-// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
-// can support at least 64 KiB, and there's some performance benefit to using
-// bigger reads, so that's what we use here.
-fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
-    let mut buffer = [0; 65536];
-    let mut total = 0;
-    loop {
-        match reader.read(&mut buffer) {
-            Ok(0) => return Ok(total),
-            Ok(n) => {
-                hasher.update(&buffer[..n]);
-                total += n as u64;
-            }
-            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
-            Err(e) => return Err(e),
-        }
-    }
-}
-
-// Mmap a file, if it looks like a good idea. Return None in cases where we
-// know mmap will fail, or if the file is short enough that mmapping isn't
-// worth it. However, if we do try to mmap and it fails, return the error.
-fn maybe_memmap_file(file: &File) -> Result<Option<memmap2::Mmap>> {
-    let metadata = file.metadata()?;
-    let file_size = metadata.len();
-    Ok(if !metadata.is_file() {
-        // Not a real file.
-        None
-    } else if file_size > isize::max_value() as u64 {
-        // Too long to safely map.
-        // https://github.com/danburkert/memmap-rs/issues/69
-        None
-    } else if file_size == 0 {
-        // Mapping an empty file currently fails.
-        // https://github.com/danburkert/memmap-rs/issues/72
-        None
-    } else if file_size < 16 * 1024 {
-        // Mapping small files is not worth it.
-        None
-    } else {
-        // Explicitly set the length of the memory map, so that filesystem
-        // changes can't race to violate the invariants we just checked.
-        let map = unsafe {
-            memmap2::MmapOptions::new()
-                .len(file_size as usize)
-                .map(file)?
-        };
-        Some(map)
-    })
-}
-
 fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
     // Encoding multiples of the 64 bytes is most efficient.
     // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 0000000..81ccbbe
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! File-related utilities: memory-mapping files and hashing them by path.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//!     let args: Vec<_> = std::env::args_os().collect();
+//!     assert_eq!(args.len(), 2);
+//!     let path = &args[1];
+//!     let mut hasher = blake3::Hasher::new();
+//!     hash_path_maybe_mmap(&mut hasher, path)?;
+//!     println!("{}", hasher.finalize());
+//!     Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Memory-map `file` if it looks like a good idea. Returns `Ok(None)` when we
+/// know mmap will fail, or when the file is under 16 KiB and mapping isn't
+/// worth it. However, if we do try to mmap and it fails, returns the error.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    let metadata = file.metadata()?;
+    let file_size = metadata.len();
+    #[allow(clippy::if_same_then_else)] // identical arms stay separate so each keeps its own rationale
+    if !metadata.is_file() {
+        // Not a real file.
+        Ok(None)
+    } else if file_size > isize::max_value() as u64 {
+        // Too long to safely map.
+        // https://github.com/danburkert/memmap-rs/issues/69
+        Ok(None)
+    } else if file_size == 0 {
+        // Mapping an empty file currently fails.
+        // https://github.com/danburkert/memmap-rs/issues/72
+        Ok(None)
+    } else if file_size < 16 * 1024 {
+        // Mapping small files is not worth it.
+        Ok(None)
+    } else {
+        // Explicitly set the length of the memory map, so that filesystem
+        // changes can't race to violate the invariants we just checked.
+        let map = unsafe { // NOTE(review): no SAFETY rationale here; presumably assumes the file is not modified while mapped — confirm
+            memmap2::MmapOptions::new()
+                .len(file_size as usize)
+                .map(file)?
+        };
+        Ok(Some(map))
+    }
+}
+
+/// Hash the file at `path` into `hasher`, memory-mapping it when worthwhile.
+///
+/// If `maybe_memmap_file` maps the file, the mapping is hashed with `update_rayon`; otherwise the file is streamed through `copy_wide`.
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+    let file = File::open(path.as_ref())?;
+    if let Some(mmap) = maybe_memmap_file(&file)? {
+        hasher.update_rayon(&mmap);
+    } else {
+        crate::copy_wide(&file, hasher)?;
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 52971b7..b262380 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
 #[cfg(feature = "traits-preview")]
 pub mod traits;
 
+#[cfg(feature = "file")]
+pub mod file;
+
 mod join;
 
 use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
     }
 }
 
+/// Copy from `reader` to `hasher`, returning the number of bytes read.
+///
+/// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
+/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
+/// can support at least 64 KiB, and there's some performance benefit to using
+/// bigger reads, so that's what we use here.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+    let mut buffer = [0; 65536]; // 64 KiB read buffer
+    let mut total = 0;
+    loop {
+        match reader.read(&mut buffer) {
+            Ok(0) => return Ok(total), // a zero-length read ends the copy
+            Ok(n) => {
+                hasher.update(&buffer[..n]);
+                total += n as u64;
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue, // retry interrupted reads
+            Err(e) => return Err(e),
+        }
+    }
+}
+
 /// An incremental reader for extended output, returned by
 /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
 ///
author	Banyc <[email protected]>	2023-07-18 00:48:52 +0800
committer	Jack O'Connor <[email protected]>	2023-09-16 14:20:39 -0700
commit	e0bb91564125407102af81e219399025aa2c24b9 (patch)
tree	f17b2553af54a7bb800753763b0e1f12b0090f84
parent	12b368541f917d69b6169a4d895e206144acd44f (diff)