move file operations from b3sum to blake3

author: Banyc <[email protected]> 2023-07-18 00:48:52 +0800
committer: Jack O'Connor <[email protected]> 2023-09-16 14:20:39 -0700
commit: e0bb91564125407102af81e219399025aa2c24b9 (patch)
tree: f17b2553af54a7bb800753763b0e1f12b0090f84 /src
parent: 12b368541f917d69b6169a4d895e206144acd44f (diff)
2 files changed, 93 insertions, 0 deletions
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 0000000..81ccbbe
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! The file-related utilities.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//!     let args: Vec<_> = std::env::args_os().collect();
+//!     assert_eq!(args.len(), 2);
+//!     let path = &args[1];
+//!     let mut hasher = blake3::Hasher::new();
+//!     hash_path_maybe_mmap(&mut hasher, path)?;
+//!     println!("{}", hasher.finalize());
+//!     Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Mmap a file, if it looks like a good idea. Returns `Ok(None)` if `file` is not
+/// a regular file, is empty, is shorter than 16 KiB, or is longer than `isize::MAX`
+/// bytes. If reading the metadata fails, or we try to mmap and it fails, returns the error.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    let metadata = file.metadata()?;
+    let file_size = metadata.len();
+    #[allow(clippy::if_same_then_else)] // Branches are kept separate so each reason has its own comment.
+    if !metadata.is_file() {
+        // Not a real file.
+        Ok(None)
+    } else if file_size > isize::max_value() as u64 {
+        // Too long to safely map.
+        // https://github.com/danburkert/memmap-rs/issues/69
+        Ok(None)
+    } else if file_size == 0 {
+        // Mapping an empty file currently fails.
+        // https://github.com/danburkert/memmap-rs/issues/72
+        Ok(None)
+    } else if file_size < 16 * 1024 {
+        // Mapping small files is not worth it.
+        Ok(None)
+    } else {
+        // Explicitly set the length of the memory map, so that filesystem
+        // changes can't race to violate the invariants we just checked.
+        let map = unsafe { // NOTE(review): no SAFETY rationale here; presumably assumes the file is not modified while mapped -- confirm.
+            memmap2::MmapOptions::new()
+                .len(file_size as usize)
+                .map(file)?
+        };
+        Ok(Some(map))
+    }
+}
+
+/// Hash the file at `path` into `hasher`, using mmap if the file is big enough.
+///
+/// A mapped file is hashed with `update_rayon`; otherwise it is streamed through `copy_wide`.
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+    let file = File::open(path.as_ref())?;
+    if let Some(mmap) = maybe_memmap_file(&file)? {
+        hasher.update_rayon(&mmap);
+    } else {
+        crate::copy_wide(&file, hasher)?;
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 52971b7..b262380 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
 #[cfg(feature = "traits-preview")]
 pub mod traits;
 
+#[cfg(feature = "file")]
+pub mod file;
+
 mod join;
 
 use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
     }
 }
 
+/// Copy from `reader` to `hasher`, returning the number of bytes read.
+///
+/// Reads until `read` returns `Ok(0)`, retrying on `ErrorKind::Interrupted` and returning any
+/// other error. A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
+/// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms can support at least
+/// 64 KiB, and there's some performance benefit to using bigger reads, so that's what we use here.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+    let mut buffer = [0; 65536];
+    let mut total = 0;
+    loop {
+        match reader.read(&mut buffer) {
+            Ok(0) => return Ok(total),
+            Ok(n) => {
+                hasher.update(&buffer[..n]);
+                total += n as u64;
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        }
+    }
+}
+
 /// An incremental reader for extended output, returned by
 /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
 ///
author	Banyc <[email protected]>	2023-07-18 00:48:52 +0800
committer	Jack O'Connor <[email protected]>	2023-09-16 14:20:39 -0700
commit	e0bb91564125407102af81e219399025aa2c24b9 (patch)
tree	f17b2553af54a7bb800753763b0e1f12b0090f84 /src
parent	12b368541f917d69b6169a4d895e206144acd44f (diff)