From cb32f0bd1450991d86a8ceb3716fd591382cc507 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 10 Sep 2023 15:18:11 -0700 Subject: replace the new file module with inherent methods on Hasher New methods: - update_reader - update_mmap - update_mmap_rayon These are more discoverable, more convenient, and safer. There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_mmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions. --- b3sum/Cargo.lock | 24 ++++++------ b3sum/Cargo.toml | 2 +- b3sum/src/main.rs | 110 ++++++++++++++++-------------------------------------- 3 files changed, 46 insertions(+), 90 deletions(-) (limited to 'b3sum') diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock index 3c7c737..d76d175 100644 --- a/b3sum/Cargo.lock +++ b/b3sum/Cargo.lock @@ -18,9 +18,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" [[package]] name = "anstyle-parse" @@ -131,9 +131,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.2" +version = "4.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a13b88d2c62ff462f88e4a121f17a82c1af05693a2f192b5c38d14de73c19f6" +checksum = "84ed82781cea27b43c9b106a979fe450a13a31aab0500595fb3fc06616de08e6" dependencies = [ "clap_builder", "clap_derive", @@ -307,9 +307,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.147" +version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" [[package]] name = "linux-raw-sys" @@ -369,9 +369,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] @@ -467,9 +467,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "2.0.32" +version = "2.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "91e02e55d62894af2a08aca894c6577281f76769ba47c94d5756bec8ac6e7373" dependencies = [ "proc-macro2", "quote", @@ -501,9 +501,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index 19b617e..dcd9f96 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -15,7 +15,7 @@ pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" -blake3 = { version = "1", path = "..", features = ["file", "rayon"] } +blake3 = { version = "1", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" memmap2 = "0.7.0" diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs index 165c579..228737f 100644 --- a/b3sum/src/main.rs +++ b/b3sum/src/main.rs @@ -163,73 +163,22 @@ impl Args { } } -enum Input { - Mmap(io::Cursor), - File(File), - Stdin, -} - -impl Input { - // Open an input file, using mmap if appropriate. "-" means stdin. Note - // that this convention applies both to command line arguments, and to - // filepaths that appear in a checkfile. - fn open(path: &Path, args: &Args) -> Result { - if path == Path::new("-") { - if args.keyed() { - bail!("Cannot open `-` in keyed mode"); - } - return Ok(Self::Stdin); - } - let file = File::open(path)?; - if !args.no_mmap() { - if let Some(mmap) = blake3::file::maybe_memmap_file(&file)? { - return Ok(Self::Mmap(io::Cursor::new(mmap))); - } - } - Ok(Self::File(file)) - } - - fn hash(&mut self, args: &Args) -> Result { - let mut hasher = args.base_hasher.clone(); - match self { - // The fast path: If we mmapped the file successfully, hash using - // multiple threads. This doesn't work on stdin, or on some files, - // and it can also be disabled with --no-mmap. - Self::Mmap(cursor) => { - hasher.update_rayon(cursor.get_ref()); - } - // The slower paths, for stdin or files we didn't/couldn't mmap. - // This is currently all single-threaded. Doing multi-threaded - // hashing without memory mapping is tricky, since all your worker - // threads have to stop every time you refill the buffer, and that - // ends up being a lot of overhead. To solve that, we need a more - // complicated double-buffering strategy where a background thread - // fills one buffer while the worker threads are hashing the other - // one. We might implement that in the future, but since this is - // the slow path anyway, it's not high priority. - Self::File(file) => { - blake3::copy_wide(file, &mut hasher)?; - } - Self::Stdin => { - let stdin = io::stdin(); - let lock = stdin.lock(); - blake3::copy_wide(lock, &mut hasher)?; - } - } - let mut output_reader = hasher.finalize_xof(); - output_reader.set_position(args.seek()); - Ok(output_reader) - } -} - -impl Read for Input { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - match self { - Self::Mmap(cursor) => cursor.read(buf), - Self::File(file) => file.read(buf), - Self::Stdin => io::stdin().read(buf), +fn hash_path(args: &Args, path: &Path) -> Result { + let mut hasher = args.base_hasher.clone(); + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); } + hasher.update_reader(io::stdin().lock())?; + } else if args.no_mmap() { + hasher.update_reader(File::open(path)?)?; + } else { + // The fast path: Try to mmap the file and hash it with multiple threads. + hasher.update_mmap_rayon(path)?; } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) } fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { @@ -425,8 +374,7 @@ fn parse_check_line(mut line: &str) -> Result { } fn hash_one_input(path: &Path, args: &Args) -> Result<()> { - let mut input = Input::open(path, args)?; - let output = input.hash(args)?; + let output = hash_path(args, path)?; if args.raw() { write_raw_output(output, args)?; return Ok(()); @@ -470,15 +418,13 @@ fn check_one_line(line: &str, args: &Args) -> bool { } else { file_string }; - let hash_result: Result = Input::open(&file_path, args) - .and_then(|mut input| input.hash(args)) - .map(|mut hash_output| { + let found_hash: blake3::Hash; + match hash_path(args, &file_path) { + Ok(mut output) => { let mut found_hash_bytes = [0; blake3::OUT_LEN]; - hash_output.fill(&mut found_hash_bytes); - found_hash_bytes.into() - }); - let found_hash: blake3::Hash = match hash_result { - Ok(hash) => hash, + output.fill(&mut found_hash_bytes); + found_hash = found_hash_bytes.into(); + } Err(e) => { println!("{}: FAILED ({})", file_string, e); return false; @@ -497,8 +443,18 @@ fn check_one_line(line: &str, args: &Args) -> bool { } fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { - let checkfile_input = Input::open(path, args)?; - let mut bufreader = io::BufReader::new(checkfile_input); + let mut file; + let stdin; + let mut stdin_lock; + let mut bufreader: io::BufReader<&mut dyn Read>; + if path == Path::new("-") { + stdin = io::stdin(); + stdin_lock = stdin.lock(); + bufreader = io::BufReader::new(&mut stdin_lock); + } else { + file = File::open(path)?; + bufreader = io::BufReader::new(&mut file); + } let mut line = String::new(); loop { line.clear(); -- cgit v1.2.3