aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Banyc <[email protected]> 2023-07-18 00:48:52 +0800
committer Jack O'Connor <[email protected]> 2023-09-16 14:20:39 -0700
commit e0bb91564125407102af81e219399025aa2c24b9 (patch)
tree f17b2553af54a7bb800753763b0e1f12b0090f84
parent 12b368541f917d69b6169a4d895e206144acd44f (diff)
move file operations from b3sum to blake3
-rw-r--r-- Cargo.toml 3
-rw-r--r-- b3sum/Cargo.lock 1
-rw-r--r-- b3sum/Cargo.toml 2
-rw-r--r-- b3sum/src/main.rs 58
-rw-r--r-- src/file.rs 67
-rw-r--r-- src/lib.rs 26
6 files changed, 101 insertions, 56 deletions
diff --git a/Cargo.toml b/Cargo.toml
index e9ab95e..74aed30 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -81,6 +81,8 @@ no_avx2 = []
no_avx512 = []
no_neon = []
+file = ["memmap2", "rayon", "std"]
+
[package.metadata.docs.rs]
# Document Hasher::update_rayon on docs.rs.
features = ["rayon", "zeroize"]
@@ -93,6 +95,7 @@ rayon = { version = "1.2.1", optional = true }
cfg-if = "1.0.0"
digest = { version = "0.10.1", features = [ "mac" ], optional = true }
zeroize = { version = "1", default-features = false, features = ["zeroize_derive"], optional = true }
+memmap2 = { version = "0.7.1", optional = true }
[dev-dependencies]
hex = "0.4.2"
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index d1049af..3c7c737 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -110,6 +110,7 @@ dependencies = [
"cc",
"cfg-if",
"constant_time_eq",
+ "memmap2",
"rayon",
]
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index 02c9405..19b617e 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -15,7 +15,7 @@ pure = ["blake3/pure"]
[dependencies]
anyhow = "1.0.25"
-blake3 = { version = "1", path = "..", features = ["rayon"] }
+blake3 = { version = "1", path = "..", features = ["file", "rayon"] }
clap = { version = "4.0.8", features = ["derive", "wrap_help"] }
hex = "0.4.0"
memmap2 = "0.7.0"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index fd35f68..165c579 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -182,7 +182,7 @@ impl Input {
}
let file = File::open(path)?;
if !args.no_mmap() {
- if let Some(mmap) = maybe_memmap_file(&file)? {
+ if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? {
return Ok(Self::Mmap(io::Cursor::new(mmap)));
}
}
@@ -208,12 +208,12 @@ impl Input {
// one. We might implement that in the future, but since this is
// the slow path anyway, it's not high priority.
Self::File(file) => {
- copy_wide(file, &mut hasher)?;
+ blake3::copy_wide(file, &mut hasher)?;
}
Self::Stdin => {
let stdin = io::stdin();
let lock = stdin.lock();
- copy_wide(lock, &mut hasher)?;
+ blake3::copy_wide(lock, &mut hasher)?;
}
}
let mut output_reader = hasher.finalize_xof();
@@ -232,58 +232,6 @@ impl Read for Input {
}
}
-// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
-// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
-// can support at least 64 KiB, and there's some performance benefit to using
-// bigger reads, so that's what we use here.
-fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
- let mut buffer = [0; 65536];
- let mut total = 0;
- loop {
- match reader.read(&mut buffer) {
- Ok(0) => return Ok(total),
- Ok(n) => {
- hasher.update(&buffer[..n]);
- total += n as u64;
- }
- Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
- Err(e) => return Err(e),
- }
- }
-}
-
-// Mmap a file, if it looks like a good idea. Return None in cases where we
-// know mmap will fail, or if the file is short enough that mmapping isn't
-// worth it. However, if we do try to mmap and it fails, return the error.
-fn maybe_memmap_file(file: &File) -> Result<Option<memmap2::Mmap>> {
- let metadata = file.metadata()?;
- let file_size = metadata.len();
- Ok(if !metadata.is_file() {
- // Not a real file.
- None
- } else if file_size > isize::max_value() as u64 {
- // Too long to safely map.
- // https://github.com/danburkert/memmap-rs/issues/69
- None
- } else if file_size == 0 {
- // Mapping an empty file currently fails.
- // https://github.com/danburkert/memmap-rs/issues/72
- None
- } else if file_size < 16 * 1024 {
- // Mapping small files is not worth it.
- None
- } else {
- // Explicitly set the length of the memory map, so that filesystem
- // changes can't race to violate the invariants we just checked.
- let map = unsafe {
- memmap2::MmapOptions::new()
- .len(file_size as usize)
- .map(file)?
- };
- Some(map)
- })
-}
-
fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
// Encoding multiples of the 64 bytes is most efficient.
// TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
diff --git a/src/file.rs b/src/file.rs
new file mode 100644
index 0000000..81ccbbe
--- /dev/null
+++ b/src/file.rs
@@ -0,0 +1,67 @@
+//! File-related utilities: optional memory mapping and hashing a file by path.
+//!
+//! # Examples
+//!
+//! ```no_run
+//! use std::io;
+//!
+//! use blake3::file::hash_path_maybe_mmap;
+//!
+//! fn main() -> io::Result<()> {
+//! let args: Vec<_> = std::env::args_os().collect();
+//! assert_eq!(args.len(), 2);
+//! let path = &args[1];
+//! let mut hasher = blake3::Hasher::new();
+//! hash_path_maybe_mmap(&mut hasher, path)?;
+//! println!("{}", hasher.finalize());
+//! Ok(())
+//! }
+//! ```
+
+use std::{fs::File, io, path::Path};
+
+/// Memory-map `file` when that looks worthwhile. Returns `Ok(None)` if it is
+/// not a regular file, is empty, is under 16 KiB, or is longer than
+/// `isize::MAX` bytes. Errors from reading metadata or from mmap are returned.
+pub fn maybe_memmap_file(file: &File) -> io::Result<Option<memmap2::Mmap>> {
+ let metadata = file.metadata()?;
+ let file_size = metadata.len();
+ #[allow(clippy::if_same_then_else)] // several arms are the same `Ok(None)`, each with its own reason
+ if !metadata.is_file() {
+ // Not a regular file (e.g. a directory or a pipe), so don't try to map it.
+ Ok(None)
+ } else if file_size > isize::max_value() as u64 {
+ // Too long to safely map.
+ // https://github.com/danburkert/memmap-rs/issues/69
+ Ok(None)
+ } else if file_size == 0 {
+ // Mapping an empty file currently fails.
+ // https://github.com/danburkert/memmap-rs/issues/72
+ Ok(None)
+ } else if file_size < 16 * 1024 {
+ // Mapping small files (under 16 KiB) is not worth it.
+ Ok(None)
+ } else {
+ // Explicitly set the length of the memory map, so that filesystem
+ // changes can't race to violate the invariants we just checked.
+ let map = unsafe { // NOTE(review): document SAFETY; the file can still be modified while mapped
+ memmap2::MmapOptions::new()
+ .len(file_size as usize)
+ .map(file)?
+ };
+ Ok(Some(map))
+ }
+}
+
+/// Open the file at `path` and feed its contents into `hasher`.
+///
+/// If `maybe_memmap_file` maps the file, it is hashed with `update_rayon`; if not, it is read in chunks by `copy_wide`.
+pub fn hash_path_maybe_mmap(hasher: &mut crate::Hasher, path: impl AsRef<Path>) -> io::Result<()> {
+ let file = File::open(path.as_ref())?;
+ if let Some(mmap) = maybe_memmap_file(&file)? {
+ hasher.update_rayon(&mmap);
+ } else {
+ crate::copy_wide(&file, hasher)?;
+ }
+ Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index 52971b7..b262380 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,6 +115,9 @@ mod sse41;
#[cfg(feature = "traits-preview")]
pub mod traits;
+#[cfg(feature = "file")]
+pub mod file;
+
mod join;
use arrayref::{array_mut_ref, array_ref};
@@ -1352,6 +1355,29 @@ impl std::io::Write for Hasher {
}
}
+/// Copy from `reader` to `hasher`, returning the number of bytes read.
+///
+/// Reads go through a 64 KiB buffer. A 16 KiB buffer is enough to take advantage
+/// of all the SIMD instruction sets that we support, and `std::io::copy` currently
+/// uses 8 KiB, but most platforms can support at least 64 KiB and bigger reads
+/// perform better. `Interrupted` reads are retried; any other error is returned.
+#[cfg(feature = "std")]
+pub fn copy_wide(mut reader: impl std::io::Read, hasher: &mut Hasher) -> std::io::Result<u64> {
+ let mut buffer = [0; 65536];
+ let mut total = 0;
+ loop {
+ match reader.read(&mut buffer) {
+ Ok(0) => return Ok(total), // a zero-length read ends the copy
+ Ok(n) => {
+ hasher.update(&buffer[..n]);
+ total += n as u64;
+ }
+ Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+ Err(e) => return Err(e),
+ }
+ }
+}
+
/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///