aboutsummaryrefslogtreecommitdiff
path: root/src/io.rs
diff options
context:
space:
mode:
authorJack O'Connor <[email protected]>2023-09-10 15:18:11 -0700
committerJack O'Connor <[email protected]>2023-09-16 17:04:27 -0700
commitcb32f0bd1450991d86a8ceb3716fd591382cc507 (patch)
treef0112db7d1a4452facd92111eb7134a20270d60e /src/io.rs
parente0bb91564125407102af81e219399025aa2c24b9 (diff)
replace the new file module with inherent methods on Hasher
New methods: - update_reader - update_mmap - update_mmap_rayon These are more discoverable, more convenient, and safer. There are two problems I want to avoid by taking a `Path` instead of a `File`. First, exposing `Mmap` objects to the caller is fundamentally unsafe, and making `maybe_mmap_file` private avoids that issue. Second, taking a `File` raises questions about whether memory mapped reads should behave like regular file reads. (Should they respect the current seek position? Should they update the seek position?) Taking a `Path` from the caller and opening the `File` internally avoids these questions.
Diffstat (limited to 'src/io.rs')
-rw-r--r--src/io.rs79
1 files changed, 79 insertions, 0 deletions
diff --git a/src/io.rs b/src/io.rs
new file mode 100644
index 0000000..1c19881
--- /dev/null
+++ b/src/io.rs
@@ -0,0 +1,79 @@
+//! Helper functions for efficient IO.
+
+#[cfg(feature = "std")]
+pub(crate) fn copy_wide(
+ mut reader: impl std::io::Read,
+ hasher: &mut crate::Hasher,
+) -> std::io::Result<u64> {
+ let mut buffer = [0; 65536];
+ let mut total = 0;
+ loop {
+ match reader.read(&mut buffer) {
+ Ok(0) => return Ok(total),
+ Ok(n) => {
+ hasher.update(&buffer[..n]);
+ total += n as u64;
+ }
+ // see test_update_reader_interrupted
+ Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+ Err(e) => return Err(e),
+ }
+ }
+}
+
+// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
+// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
+// fails, return the error.
+//
+// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
+// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
+// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
+// this function is crate-private, we can guarantee that all can ever happen in the event of a race
+// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
+// which should risk memory corruption in a safe caller.
+//
+// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
+// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
+// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
+// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
+// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
+// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
+// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
+// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
+// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
+// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
+// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
+// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
+// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
+// different here. Feedback needed.
+#[cfg(feature = "mmap")]
+pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
+ let metadata = file.metadata()?;
+ let file_size = metadata.len();
+ #[allow(clippy::if_same_then_else)]
+ if !metadata.is_file() {
+ // Not a real file.
+ Ok(None)
+ } else if file_size > isize::max_value() as u64 {
+ // Too long to safely map.
+ // https://github.com/danburkert/memmap-rs/issues/69
+ Ok(None)
+ } else if file_size == 0 {
+ // Mapping an empty file currently fails.
+ // https://github.com/danburkert/memmap-rs/issues/72
+ // See test_mmap_virtual_file.
+ Ok(None)
+ } else if file_size < 16 * 1024 {
+ // Mapping small files is not worth it.
+ Ok(None)
+ } else {
+ // Explicitly set the length of the memory map, so that filesystem
+ // changes can't race to violate the invariants we just checked.
+ let map = unsafe {
+ memmap2::MmapOptions::new()
+ .len(file_size as usize)
+ .map(file)?
+ };
+ Ok(Some(map))
+ }
+}