aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorBanyc <[email protected]>2023-07-18 00:48:52 +0800
committerJack O'Connor <[email protected]>2023-09-16 14:20:39 -0700
commite0bb91564125407102af81e219399025aa2c24b9 (patch)
treef17b2553af54a7bb800753763b0e1f12b0090f84 /src
parent12b368541f917d69b6169a4d895e206144acd44f (diff)
move file operations from b3sum to blake3
Diffstat (limited to 'src')
-rw-r--r--src/file.rs67
-rw-r--r--src/lib.rs26
2 files changed, 93 insertions, 0 deletions
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 0000000..81ccbbe
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! Utilities for hashing files, memory-mapping them when that looks worthwhile.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//! let args: Vec<_> = std::env::args_os().collect();
+//! assert_eq!(args.len(), 2);
+//! let path = &args[1];
+//! let mut hasher = blake3::Hasher::new();
+//! hash_path_maybe_mmap(&mut hasher, path)?;
+//! println!("{}", hasher.finalize());
+//! Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Memory-map `file` when doing so is both possible and worthwhile.
+///
+/// Returns `Ok(None)` without attempting a mapping when the handle is not a
+/// regular file, when its length exceeds `isize::MAX`, when it is empty, or
+/// when it is shorter than 16 KiB. Otherwise the file is mapped and
+/// `Ok(Some(map))` is returned.
+///
+/// # Errors
+///
+/// Fails if the file's metadata cannot be read, or if a mapping is attempted
+/// and that attempt fails.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    // Files shorter than this are left to ordinary reads.
+    const MIN_MMAP_LEN: u64 = 16 * 1024;
+
+    let metadata = file.metadata()?;
+    if !metadata.is_file() {
+        // Not a real file.
+        return Ok(None);
+    }
+
+    let file_size = metadata.len();
+    // Too long to safely map.
+    // https://github.com/danburkert/memmap-rs/issues/69
+    let too_long = file_size > isize::MAX as u64;
+    // Mapping an empty file currently fails.
+    // https://github.com/danburkert/memmap-rs/issues/72
+    let empty = file_size == 0;
+    // Mapping small files is not worth it.
+    let too_short = file_size < MIN_MMAP_LEN;
+    if too_long || empty || too_short {
+        return Ok(None);
+    }
+
+    // Explicitly set the length of the memory map, so that filesystem
+    // changes can't race to violate the invariants we just checked. The cast
+    // to `usize` is lossless because `file_size <= isize::MAX` was checked
+    // above.
+    let mut options = memmap2::MmapOptions::new();
+    options.len(file_size as usize);
+    // SAFETY: NOTE(review) -- memmap2 marks file-backed mappings `unsafe`
+    // because the underlying file may be modified while it is mapped. Nothing
+    // in this function prevents that; confirm callers accept that risk.
+    let map = unsafe { options.map(file)? };
+    Ok(Some(map))
+}
+
+/// Feed the contents of the file at `path` into `hasher`.
+///
+/// The file is opened and offered to `maybe_memmap_file`. If that yields a
+/// mapping, the mapped bytes are hashed with `Hasher::update_rayon`;
+/// otherwise the file is read in chunks through `crate::copy_wide`. The
+/// hasher is not finalized here, so the caller decides when to do that.
+///
+/// # Errors
+///
+/// Returns any I/O error raised while opening, mapping, or reading the file.
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+    let file = File::open(path.as_ref())?;
+    match maybe_memmap_file(&file)? {
+        Some(mapping) => {
+            // Large enough to map: hash the whole mapping in one call.
+            hasher.update_rayon(&mapping);
+        }
+        None => {
+            // No mapping: fall back to buffered reads. The byte count
+            // returned by `copy_wide` is not needed here.
+            crate::copy_wide(&file, hasher)?;
+        }
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 52971b7..b262380 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
#[cfg(feature = "traits-preview")]
pub mod traits;
+#[cfg(feature = "file")]
+pub mod file;
+
mod join;
use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
}
}
+/// Read `reader` to the end, feeding every chunk into `hasher`, and return
+/// the total number of bytes read.
+///
+/// Reads go through a 64 KiB buffer. A 16 KiB buffer is enough to take
+/// advantage of all the SIMD instruction sets that we support, but
+/// `std::io::copy` currently uses 8 KiB. Most platforms can support at least
+/// 64 KiB, and there's some performance benefit to using bigger reads.
+///
+/// # Errors
+///
+/// Reads that fail with `ErrorKind::Interrupted` are retried. Any other read
+/// error is returned immediately; bytes read before the error have already
+/// been fed into `hasher`.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+    let mut chunk = [0u8; 65536];
+    let mut bytes_hashed: u64 = 0;
+    loop {
+        let read_len = match reader.read(&mut chunk) {
+            Ok(len) => len,
+            // An interrupted read transferred nothing; just try again.
+            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(err) => return Err(err),
+        };
+        // A zero-length read signals end of input.
+        if read_len == 0 {
+            return Ok(bytes_hashed);
+        }
+        hasher.update(&chunk[..read_len]);
+        bytes_hashed += read_len as u64;
+    }
+}
+
/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///