diff options
| author | Banyc <[email protected]> | 2023-07-18 00:48:52 +0800 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2023-09-16 14:20:39 -0700 |
| commit | e0bb91564125407102af81e219399025aa2c24b9 (patch) | |
| tree | f17b2553af54a7bb800753763b0e1f12b0090f84 /src | |
| parent | 12b368541f917d69b6169a4d895e206144acd44f (diff) | |
move file operations from b3sum to blake3
Diffstat (limited to 'src')
| -rw-r--r-- | src/file.rs | 67 | ||||
| -rw-r--r-- | src/lib.rs | 26 |
2 files changed, 93 insertions, 0 deletions
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 0000000..81ccbbe
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! Utilities for hashing files, optionally through a memory map.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//!     let args: Vec<_> = std::env::args_os().collect();
+//!     assert_eq!(args.len(), 2);
+//!     let path = &args[1];
+//!     let mut hasher = blake3::Hasher::new();
+//!     hash_path_maybe_mmap(&mut hasher, path)?;
+//!     println!("{}", hasher.finalize());
+//!     Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Mmap a file, if it looks like a good idea. Returns `Ok(None)` in cases where
+/// we know mmap will fail, or if the file is short enough (under 16 KiB) that
+/// mmapping isn't worth it. If we do try to mmap and it fails, returns the error.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+    let metadata = file.metadata()?;
+    let file_size = metadata.len();
+    #[allow(clippy::if_same_then_else)] // one branch per documented reason
+    if !metadata.is_file() {
+        // Not a real file.
+        Ok(None)
+    } else if file_size > isize::MAX as u64 {
+        // Too long to safely map.
+        // https://github.com/danburkert/memmap-rs/issues/69
+        Ok(None)
+    } else if file_size == 0 {
+        // Mapping an empty file currently fails.
+        // https://github.com/danburkert/memmap-rs/issues/72
+        Ok(None)
+    } else if file_size < 16 * 1024 {
+        // Mapping small files is not worth it.
+        Ok(None)
+    } else {
+        // Explicitly set the length of the memory map, so that filesystem
+        // changes can't race to violate the invariants we just checked.
+        let map = unsafe {
+            memmap2::MmapOptions::new()
+                .len(file_size as usize)
+                .map(file)?
+        };
+        Ok(Some(map))
+    }
+}
+
+/// Hash a file fast.
+///
+/// It may use mmap if the file is big enough. If not, it streams it via [`crate::copy_wide`].
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+    let file = File::open(path.as_ref())?;
+    if let Some(bytes) = maybe_memmap_file(&file)?.as_deref() {
+        hasher.update_rayon(bytes);
+    } else {
+        crate::copy_wide(&file, hasher)?;
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
 #[cfg(feature = "traits-preview")]
 pub mod traits;
 
+#[cfg(feature = "file")]
+pub mod file;
+
 mod join;
 
 use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
     }
 }
 
+/// Feed everything `reader` yields into `hasher` and return the byte count.
+///
+/// Reads go through a 64 KiB buffer. 16 KiB is already enough to take advantage
+/// of every SIMD instruction set that we support, and `std::io::copy` currently
+/// uses 8 KiB, but most platforms can support at least 64 KiB and bigger reads
+/// bring some performance benefit. `ErrorKind::Interrupted` reads are retried.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+    let mut chunk = [0u8; 65536];
+    let mut bytes_read = 0u64;
+    loop {
+        let len = match reader.read(&mut chunk) {
+            // A zero-length read signals end of input.
+            Ok(0) => return Ok(bytes_read),
+            Ok(len) => len,
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        };
+        hasher.update(&chunk[..len]);
+        bytes_read += len as u64;
+    }
+}
+
 /// An incremental reader for extended output, returned by
 /// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
 ///
|
