author     Jack O'Connor <[email protected]>   2024-01-21 21:15:45 -0800
committer  Jack O'Connor <[email protected]>   2024-01-21 21:51:53 -0800
commit     bd160e33a28c048e8c371bf49d08169a3b7d9edd (patch)
tree       bb00539ec2c3506bf2e32ef07ea804073f3ee82a
parent     5558fa46239742720d84c46edb0544732adf4db8 (diff)
factor out the `blake3` crate changes from the guts_api branch
This commit and the branch that it starts are unlikely to land as-is, but I want to maintain them while I flesh out the new `blake3_guts` sub-crate.
-rw-r--r--  Cargo.toml                    2
-rw-r--r--  b3sum/Cargo.lock             12
-rw-r--r--  b3sum/src/main.rs             2
-rw-r--r--  benches/bench.rs            175
-rw-r--r--  src/lib.rs                  551
-rw-r--r--  src/portable.rs               8
-rw-r--r--  src/test.rs                 408
-rw-r--r--  test_vectors/Cargo.toml       1
-rwxr-xr-x  test_vectors/cross_test.sh    3
-rw-r--r--  test_vectors/src/lib.rs       2
10 files changed, 339 insertions, 825 deletions
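
For orientation before the file-by-file diff: this branch replaces the blake3 crate's internal `platform` dispatch with the new `blake3_guts` sub-crate (at `rust/guts`), reached through a runtime-detected implementation handle. The sketch below reconstructs the small slice of that API this diff actually calls. The names come from call sites in src/lib.rs further down, but the signatures are inferred rather than quoted, and per the commit message the guts crate is still in flux, so treat this as a reading aid, not the published API.

// Inferred sketch only, not the real blake3_guts source. Besides what's
// stubbed here, the call sites below also use hash_chunks, hash_parents,
// reduce_parents, split_transposed_vectors, xof, xof_xor, TransposedVectors,
// left_len, largest_power_of_two_leq, and the flag constants.
pub type CVBytes = [u8; 32]; // one little-endian chaining value
pub type BlockBytes = [u8; 64]; // one 64-byte message block
pub const BLOCK_LEN: usize = 64;
pub const CHUNK_LEN: usize = 1024;

pub struct Implementation;

// Runtime CPU detection picks a backend once; callers just use this handle.
pub static DETECTED_IMPL: Implementation = Implementation;

impl Implementation {
    // SIMD degree of the detected backend, always a power of 2.
    pub fn degree(&self) -> usize {
        unimplemented!()
    }

    // One compression. Unlike the old compress_in_place, this returns the new
    // CV by value, which is what lets ChunkState and Output drop their
    // Platform fields in the diff below.
    pub fn compress(
        &self,
        _block: &BlockBytes,
        _block_len: u32,
        _cv: &CVBytes,
        _counter: u64,
        _flags: u32,
    ) -> CVBytes {
        unimplemented!()
    }
}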
diff --git a/Cargo.toml b/Cargo.toml
index 591a6e3..fe73436 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -92,8 +92,8 @@ no_neon = []
features = ["mmap", "rayon", "serde", "zeroize"]
[dependencies]
-arrayref = "0.3.5"
arrayvec = { version = "0.7.4", default-features = false }
+blake3_guts = { path = "rust/guts" }
constant_time_eq = "0.3.0"
cfg-if = "1.0.0"
digest = { version = "0.10.1", features = [ "mac" ], optional = true }
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index 10caffb..7bab0b0 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -57,12 +57,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
[[package]]
-name = "arrayref"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
-
-[[package]]
name = "arrayvec"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -98,8 +92,8 @@ checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
name = "blake3"
version = "1.5.0"
dependencies = [
- "arrayref",
"arrayvec",
+ "blake3_guts",
"cc",
"cfg-if",
"constant_time_eq",
@@ -108,6 +102,10 @@ dependencies = [
]
[[package]]
+name = "blake3_guts"
+version = "0.0.0"
+
+[[package]]
name = "cc"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index 228737f..3dadc00 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -186,7 +186,7 @@ fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> {
// TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
// We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF.
let mut len = args.len();
- let mut block = [0; blake3::guts::BLOCK_LEN];
+ let mut block = [0; 64];
while len > 0 {
output.fill(&mut block);
let hex_str = hex::encode(&block[..]);
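
With the public `blake3::guts` module gone, b3sum inlines the block length as a literal 64; the value is fixed by the BLAKE3 spec, so this is safe. If the magic number grates, a local constant keeps the loop self-documenting without reaching into crate internals. Here is a minimal sketch of the same hex-output loop written that way (our illustration, not b3sum's code):

use std::io::{self, Write};

// Sketch only: mirrors write_hex_output above, with the block length as a
// named local constant instead of a bare 64.
fn write_hex(mut output: blake3::OutputReader, mut len: u64, w: &mut impl Write) -> io::Result<()> {
    const BLOCK_LEN: usize = 64; // BLAKE3's fixed block length
    let mut block = [0u8; BLOCK_LEN];
    while len > 0 {
        output.fill(&mut block);
        let take = len.min(BLOCK_LEN as u64) as usize;
        w.write_all(hex::encode(&block[..take]).as_bytes())?;
        len -= take as u64;
    }
    Ok(())
}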
diff --git a/benches/bench.rs b/benches/bench.rs
index 5efb9e6..e057d24 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -2,11 +2,7 @@
extern crate test;
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
-use blake3::platform::{Platform, MAX_SIMD_DEGREE};
-use blake3::OUT_LEN;
+use blake3_guts::BLOCK_LEN;
use rand::prelude::*;
use test::Bencher;
@@ -49,175 +45,6 @@ impl RandomInput {
}
}
-fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
- let mut state = [1u32; 8];
- let mut r = RandomInput::new(b, 64);
- let input = array_ref!(r.get(), 0, 64);
- b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
-}
-
-#[bench]
-fn bench_single_compression_portable(b: &mut Bencher) {
- bench_single_compression_fn(b, Platform::portable());
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_single_compression_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_single_compression_fn(b, platform);
- }
-}
-
-fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
- let degree = platform.simd_degree();
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, CHUNK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- platform.hash_many(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::Yes,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_avx2(b: &mut Bencher) {
- if let Some(platform) = Platform::avx2() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_chunks_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_chunks_neon(b: &mut Bencher) {
- if let Some(platform) = Platform::neon() {
- bench_many_chunks_fn(b, platform);
- }
-}
-
-// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
- let degree = platform.simd_degree();
- let mut inputs = Vec::new();
- for _ in 0..degree {
- inputs.push(RandomInput::new(b, BLOCK_LEN));
- }
- b.iter(|| {
- let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- platform.hash_many(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::No,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse2(b: &mut Bencher) {
- if let Some(platform) = Platform::sse2() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse41(b: &mut Bencher) {
- if let Some(platform) = Platform::sse41() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_avx2(b: &mut Bencher) {
- if let Some(platform) = Platform::avx2() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_parents_avx512(b: &mut Bencher) {
- if let Some(platform) = Platform::avx512() {
- bench_many_parents_fn(b, platform);
- }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_parents_neon(b: &mut Bencher) {
- if let Some(platform) = Platform::neon() {
- bench_many_parents_fn(b, platform);
- }
-}
-
fn bench_atonce(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| blake3::hash(input.get()));
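
All of the per-platform compress and hash_many benchmarks above are deleted together with the public `Platform` API; presumably their replacements live alongside the primitives in `rust/guts`. For flavor, a single-compression benchmark against the new interface might look like the following sketch. It assumes the `compress` signature inferred from src/lib.rs below and the `DETECTED_IMPL` handle in blake3_guts; neither is quoted from this bench file.

// Sketch only: (block, block_len, cv, counter, flags) -> new CV.
#[bench]
fn bench_single_compression_detected(b: &mut Bencher) {
    let cv = [0u8; 32];
    let mut r = RandomInput::new(b, BLOCK_LEN);
    b.iter(|| {
        let block: &[u8; BLOCK_LEN] = r.get().try_into().unwrap();
        blake3_guts::DETECTED_IMPL.compress(block, BLOCK_LEN as u32, &cv, 0, 0)
    });
}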
diff --git a/src/lib.rs b/src/lib.rs
index 1fe47bf..efc1fc4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -88,109 +88,29 @@
#[cfg(test)]
mod test;
-// The guts module is for incremental use cases like the `bao` crate that need
-// to explicitly compute chunk and parent chaining values. It is semi-stable
-// and likely to keep working, but largely undocumented and not intended for
-// widespread use.
-#[doc(hidden)]
-pub mod guts;
-
-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-pub mod platform;
-
-// Platform-specific implementations of the compression function. These
-// BLAKE3-specific cfg flags are set in build.rs.
-#[cfg(blake3_avx2_rust)]
-#[path = "rust_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx2_ffi)]
-#[path = "ffi_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx512_ffi)]
-#[path = "ffi_avx512.rs"]
-mod avx512;
-#[cfg(blake3_neon)]
-#[path = "ffi_neon.rs"]
-mod neon;
-mod portable;
-#[cfg(blake3_sse2_rust)]
-#[path = "rust_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse2_ffi)]
-#[path = "ffi_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse41_rust)]
-#[path = "rust_sse41.rs"]
-mod sse41;
-#[cfg(blake3_sse41_ffi)]
-#[path = "ffi_sse41.rs"]
-mod sse41;
-
#[cfg(feature = "traits-preview")]
pub mod traits;
mod io;
mod join;
-use arrayref::{array_mut_ref, array_ref};
use arrayvec::{ArrayString, ArrayVec};
use core::cmp;
use core::fmt;
-use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
-/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
+use blake3_guts as guts;
+use guts::{
+ BlockBytes, CVBytes, BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT,
+ DERIVE_KEY_MATERIAL, IV_BYTES, KEYED_HASH, PARENT, ROOT,
+};
+
+/// The number of bytes in a [`Hash`](struct.Hash.html), 32
pub const OUT_LEN: usize = 32;
-/// The number of bytes in a key, 32.
+/// The number of bytes in a key, 32
pub const KEY_LEN: usize = 32;
const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
-use guts::{BLOCK_LEN, CHUNK_LEN};
-
-// While iterating the compression function within a chunk, the CV is
-// represented as words, to avoid doing two extra endianness conversions for
-// each compression in the portable implementation. But the hash_many interface
-// needs to hash both input bytes and parent nodes, so its better for its
-// output CVs to be represented as bytes.
-type CVWords = [u32; 8];
-type CVBytes = [u8; 32]; // little-endian
-
-const IV: &CVWords = &[
- 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
-];
-
-const MSG_SCHEDULE: [[usize; 16]; 7] = [
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
- [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
- [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
- [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
- [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
- [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
- [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
-];
-
-// These are the internal flags that we use to domain separate root/non-root,
-// chunk/parent, and chunk beginning/middle/end. These get set at the high end
-// of the block flags word in the compression function, so their values start
-// high and go down.
-const CHUNK_START: u8 = 1 << 0;
-const CHUNK_END: u8 = 1 << 1;
-const PARENT: u8 = 1 << 2;
-const ROOT: u8 = 1 << 3;
-const KEYED_HASH: u8 = 1 << 4;
-const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
-const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
-
-#[inline]
-fn counter_low(counter: u64) -> u32 {
- counter as u32
-}
-
-#[inline]
-fn counter_high(counter: u64) -> u32 {
- (counter >> 32) as u32
-}
/// An output of the default size, 32 bytes, which provides constant-time
/// equality checking.
@@ -219,19 +139,19 @@ fn counter_high(counter: u64) -> u32 {
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[derive(Clone, Copy, Hash)]
-pub struct Hash([u8; OUT_LEN]);
+pub struct Hash(CVBytes);
impl Hash {
/// The raw bytes of the `Hash`. Note that byte arrays don't provide
/// constant-time equality checking, so if you need to compare hashes,
/// prefer the `Hash` type.
#[inline]
- pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
+ pub const fn as_bytes(&self) -> &CVBytes {
&self.0
}
/// Create a `Hash` from its raw bytes representation.
- pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
+ pub const fn from_bytes(bytes: CVBytes) -> Self {
Self(bytes)
}
@@ -275,7 +195,7 @@ impl Hash {
if hex_bytes.len() != OUT_LEN * 2 {
return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
}
- let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN];
+ let mut hash_bytes: CVBytes = [0; OUT_LEN];
for i in 0..OUT_LEN {
hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?;
}
@@ -283,14 +203,14 @@ impl Hash {
}
}
-impl From<[u8; OUT_LEN]> for Hash {
+impl From<CVBytes> for Hash {
#[inline]
- fn from(bytes: [u8; OUT_LEN]) -> Self {
+ fn from(bytes: CVBytes) -> Self {
Self::from_bytes(bytes)
}
}
-impl From<Hash> for [u8; OUT_LEN] {
+impl From<Hash> for CVBytes {
#[inline]
fn from(hash: Hash) -> Self {
hash.0
@@ -314,9 +234,9 @@ impl PartialEq for Hash {
}
/// This implementation is constant-time.
-impl PartialEq<[u8; OUT_LEN]> for Hash {
+impl PartialEq<CVBytes> for Hash {
#[inline]
- fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
+ fn eq(&self, other: &CVBytes) -> bool {
constant_time_eq::constant_time_eq_32(&self.0, other)
}
}
@@ -395,70 +315,56 @@ impl std::error::Error for HexError {}
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
#[derive(Clone)]
struct Output {
- input_chaining_value: CVWords,
- block: [u8; 64],
+ input_chaining_value: CVBytes,
+ block: BlockBytes,
block_len: u8,
counter: u64,
flags: u8,
- #[cfg_attr(feature = "zeroize", zeroize(skip))]
- platform: Platform,
}
impl Output {
fn chaining_value(&self) -> CVBytes {
- let mut cv = self.input_chaining_value;
- self.platform.compress_in_place(
- &mut cv,
+ guts::DETECTED_IMPL.compress(
&self.block,
- self.block_len,
+ self.block_len as u32,
+ &self.input_chaining_value,
self.counter,
- self.flags,
- );
- platform::le_bytes_from_words_32(&cv)
+ self.flags as u32,
+ )
}
fn root_hash(&self) -> Hash {
debug_assert_eq!(self.counter, 0);
- let mut cv = self.input_chaining_value;
- self.platform
- .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
- Hash(platform::le_bytes_from_words_32(&cv))
- }
-
- fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
- self.platform.compress_xof(
- &self.input_chaining_value,
+ Hash(guts::DETECTED_IMPL.compress(
&self.block,
- self.block_len,
- self.counter,
- self.flags | ROOT,
- )
+ self.block_len as u32,
+ &self.input_chaining_value,
+ 0,
+ self.flags as u32 | ROOT,
+ ))
}
}
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
struct ChunkState {
- cv: CVWords,
+ cv: CVBytes,
chunk_counter: u64,
- buf: [u8; BLOCK_LEN],
+ buf: BlockBytes,
buf_len: u8,
blocks_compressed: u8,
flags: u8,
- #[cfg_attr(feature = "zeroize", zeroize(skip))]
- platform: Platform,
}
impl ChunkState {
- fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
+ fn new(key: &CVBytes, chunk_counter: u64, flags: u32) -> Self {
Self {
cv: *key,
chunk_counter,
buf: [0; BLOCK_LEN],
buf_len: 0,
blocks_compressed: 0,
- flags,
- platform,
+ flags: flags as u8,
}
}
@@ -474,7 +380,7 @@ impl ChunkState {
*input = &input[take..];
}
- fn start_flag(&self) -> u8 {
+ fn start_flag(&self) -> u32 {
if self.blocks_compressed == 0 {
CHUNK_START
} else {
@@ -489,13 +395,12 @@ impl ChunkState {
self.fill_buf(&mut input);
if !input.is_empty() {
debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
- let block_flags = self.flags | self.start_flag(); // borrowck
- self.platform.compress_in_place(
- &mut self.cv,
+ self.cv = guts::DETECTED_IMPL.compress(
&self.buf,
- BLOCK_LEN as u8,
+ BLOCK_LEN as u32,
+ &self.cv,
self.chunk_counter,
- block_flags,
+ self.flags as u32 | self.start_flag(),
);
self.buf_len = 0;
self.buf = [0; BLOCK_LEN];
@@ -505,13 +410,12 @@ impl ChunkState {
while input.len() > BLOCK_LEN {
debug_assert_eq!(self.buf_len, 0);
- let block_flags = self.flags | self.start_flag(); // borrowck
- self.platform.compress_in_place(
- &mut self.cv,
- array_ref!(input, 0, BLOCK_LEN),
- BLOCK_LEN as u8,
+ self.cv = guts::DETECTED_IMPL.compress(
+ input[..BLOCK_LEN].try_into().unwrap(),
+ BLOCK_LEN as u32,
+ &self.cv,
self.chunk_counter,
- block_flags,
+ self.flags as u32 | self.start_flag(),
);
self.blocks_compressed += 1;
input = &input[BLOCK_LEN..];
@@ -524,14 +428,12 @@ impl ChunkState {
}
fn output(&self) -> Output {
- let block_flags = self.flags | self.start_flag() | CHUNK_END;
Output {
input_chaining_value: self.cv,
block: self.buf,
block_len: self.buf_len,
counter: self.chunk_counter,
- flags: block_flags,
- platform: self.platform,
+ flags: self.flags | self.start_flag() as u8 | CHUNK_END as u8,
}
}
}
@@ -543,7 +445,6 @@ impl fmt::Debug for ChunkState {
.field("len", &self.len())
.field("chunk_counter", &self.chunk_counter)
.field("flags", &self.flags)
- .field("platform", &self.platform)
.finish()
}
}
@@ -563,131 +464,6 @@ impl fmt::Debug for ChunkState {
// use full-width SIMD vectors for parent hashing. Without parallel parent
// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-#[derive(Clone, Copy)]
-pub enum IncrementCounter {
- Yes,
- No,
-}
-
-impl IncrementCounter {
- #[inline]
- fn yes(&self) -> bool {
- match self {
- IncrementCounter::Yes => true,
- IncrementCounter::No => false,
- }
- }
-}
-
-// The largest power of two less than or equal to `n`, used for left_len()
-// immediately below, and also directly in Hasher::update().
-fn largest_power_of_two_leq(n: usize) -> usize {
- ((n / 2) + 1).next_power_of_two()
-}
-
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-fn left_len(content_len: usize) -> usize {
- debug_assert!(content_len > CHUNK_LEN);
- // Subtract 1 to reserve at least one byte for the right side.
- let full_chunks = (content_len - 1) / CHUNK_LEN;
- largest_power_of_two_leq(full_chunks) * CHUNK_LEN
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
-// on a single thread. Write out the chunk chaining values and return the
-// number of chunks hashed. These chunks are never the root and never empty;
-// those cases use a different codepath.
-fn compress_chunks_parallel(
- input: &[u8],
- key: &CVWords,
- chunk_counter: u64,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
-) -> usize {
- debug_assert!(!input.is_empty(), "empty chunks below the root");
- debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
-
- let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
- let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
- for chunk in &mut chunks_exact {
- chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
- }
- platform.hash_many(
- &chunks_array,
- key,
- chunk_counter,
- IncrementCounter::Yes,
- flags,
- CHUNK_START,
- CHUNK_END,
- out,
- );
-
- // Hash the remaining partial chunk, if there is one. Note that the empty
- // chunk (meaning the empty message) is a different codepath.
- let chunks_so_far = chunks_array.len();
- if !chunks_exact.remainder().is_empty() {
- let counter = chunk_counter + chunks_so_far as u64;
- let mut chunk_state = ChunkState::new(key, counter, flags, platform);
- chunk_state.update(chunks_exact.remainder());
- *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
- chunk_state.output().chaining_value();
- chunks_so_far + 1
- } else {
- chunks_so_far
- }
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
-// on a single thread. Write out the parent chaining values and return the
-// number of parents hashed. (If there's an odd input chaining value left over,
-// return it as an additional output.) These parents are never the root and
-// never empty; those cases use a different codepath.
-fn compress_parents_parallel(
- child_chaining_values: &[u8],
- key: &CVWords,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
-) -> usize {
- debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes");
- let num_children = child_chaining_values.len() / OUT_LEN;
- debug_assert!(num_children >= 2, "not enough children");
- debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many");
-
- let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
- // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
- // the requirements of compress_subtree_wide().
- let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new();
- for parent in &mut parents_exact {
- parents_array.push(array_ref!(parent, 0, BLOCK_LEN));
- }
- platform.hash_many(
- &parents_array,
- key,
- 0, // Parents always use counter 0.
- IncrementCounter::No,
- flags | PARENT,
- 0, // Parents have no start flags.
- 0, // Parents have no end flags.
- out,
- );
-
- // If there's an odd child left over, it becomes an output.
- let parents_so_far = parents_array.len();
- if !parents_exact.remainder().is_empty() {
- out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
- parents_so_far + 1
- } else {
- parents_so_far
- }
-}
-
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
@@ -707,66 +483,41 @@ fn compress_parents_parallel(
// multithreading parallelism for that update().
fn compress_subtree_wide<J: join::Join>(
input: &[u8],
- key: &CVWords,
+ key: &CVBytes,
chunk_counter: u64,
- flags: u8,
- platform: Platform,
- out: &mut [u8],
+ flags: u32,
+ out: guts::TransposedSplit,
) -> usize {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. This allows Rayon the option of multithreading even the
// 2-chunk case, which can help performance on smaller platforms.
- if input.len() <= platform.simd_degree() * CHUNK_LEN {
- return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
+ let degree = guts::DETECTED_IMPL.degree();
+ if input.len() <= degree * CHUNK_LEN {
+ return guts::DETECTED_IMPL.hash_chunks(input, key, chunk_counter, flags, out);
}
// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
- debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
- let (left, right) = input.split_at(left_len(input.len()));
+ debug_assert_eq!(degree.count_ones(), 1, "power of 2");
+ let (left, right) = input.split_at(guts::left_len(input.len()));
let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
- // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
- // account for the special case of returning 2 outputs when the SIMD degree
- // is 1.
- let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
- let degree = if left.len() == CHUNK_LEN {
- // The "simd_degree=1 and we're at the leaf nodes" case.
- debug_assert_eq!(platform.simd_degree(), 1);
- 1
- } else {
- cmp::max(platform.simd_degree(), 2)
- };
- let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);
+ let mut transposed_cvs = guts::TransposedVectors::new();
+ let (left_cvs, right_cvs) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);
// Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple
// threads.
let (left_n, right_n) = J::join(
- || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
- || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
+ || compress_subtree_wide::<J>(left, key, chunk_counter, flags, left_cvs),
+ || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, right_cvs),
);
- // The special case again. If simd_degree=1, then we'll have left_n=1 and
- // right_n=1. Rather than compressing them into a single output, return
- // them directly, to make sure we always have at least two outputs.
- debug_assert_eq!(left_n, degree);
- debug_assert!(right_n >= 1 && right_n <= left_n);
- if left_n == 1 {
- out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]);
- return 2;
- }
-
- // Otherwise, do one layer of parent node compression.
- let num_children = left_n + right_n;
- compress_parents_parallel(
- &cv_array[..num_children * OUT_LEN],
- key,
- flags,
- platform,
- out,
- )
+ // Do one layer of parent node compression. The SIMD degree is always at least 2, so we're
+ // guaranteed that this isn't the root compression.
+ let num_cvs = left_n + right_n;
+ guts::DETECTED_IMPL.hash_parents(&mut transposed_cvs, num_cvs, key, flags, out)
}
// Hash a subtree with compress_subtree_wide(), and then condense the resulting
@@ -781,50 +532,41 @@ fn compress_subtree_wide<J: join::Join>(
// chunk or less. That's a different codepath.
fn compress_subtree_to_parent_node<J: join::Join>(
input: &[u8],
- key: &CVWords,
+ key: &CVBytes,
chunk_counter: u64,
- flags: u8,
- platform: Platform,
-) -> [u8; BLOCK_LEN] {
+ flags: u32,
+) -> BlockBytes {
debug_assert!(input.len() > CHUNK_LEN);
- let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
- let mut num_cvs =
- compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
+ let mut transposed_cvs = guts::TransposedVectors::new();
+ let (left_cvs, _) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);
+ let mut num_cvs = compress_subtree_wide::<J>(input, &key, chunk_counter, flags, left_cvs);
debug_assert!(num_cvs >= 2);
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
- let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2];
while num_cvs > 2 {
- let cv_slice = &cv_array[..num_cvs * OUT_LEN];
- num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array);
- cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
+ num_cvs = guts::DETECTED_IMPL.reduce_parents(&mut transposed_cvs, num_cvs, key, flags);
}
- *array_ref!(cv_array, 0, 2 * OUT_LEN)
+ transposed_cvs.extract_parent_node(0)
}
// Hash a complete input all at once. Unlike compress_subtree_wide() and
// compress_subtree_to_parent_node(), this function handles the 1 chunk case.
-fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
- let platform = Platform::detect();
-
+fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVBytes, flags: u32) -> Output {
// If the whole subtree is one chunk, hash it directly with a ChunkState.
if input.len() <= CHUNK_LEN {
- return ChunkState::new(key, 0, flags, platform)
- .update(input)
- .output();
+ return ChunkState::new(key, 0, flags).update(input).output();
}
// Otherwise construct an Output object from the parent node returned by
// compress_subtree_to_parent_node().
Output {
input_chaining_value: *key,
- block: compress_subtree_to_parent_node::<J>(input, key, 0, flags, platform),
+ block: compress_subtree_to_parent_node::<J>(input, key, 0, flags),
block_len: BLOCK_LEN as u8,
counter: 0,
- flags: flags | PARENT,
- platform,
+ flags: flags as u8 | PARENT as u8,
}
}
@@ -839,7 +581,7 @@ fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
pub fn hash(input: &[u8]) -> Hash {
- hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
+ hash_all_at_once::<join::SerialJoin>(input, &IV_BYTES, 0).root_hash()
}
/// The keyed hash function.
@@ -856,9 +598,8 @@ pub fn hash(input: &[u8]) -> Hash {
/// This function is always single-threaded. For multithreading support, see
/// [`Hasher::new_keyed`] and
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
-pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
- let key_words = platform::words_from_le_bytes_32(key);
- hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
+pub fn keyed_hash(key: &CVBytes, input: &[u8]) -> Hash {
+ hash_all_at_once::<join::SerialJoin>(input, key, KEYED_HASH).root_hash()
}
/// The key derivation function.
@@ -896,12 +637,11 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
///
/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
-pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
+pub fn derive_key(context: &str, key_material: &[u8]) -> CVBytes {
let context_key =
- hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+ hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
.root_hash();
- let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
- hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+ hash_all_at_once::<join::SerialJoin>(key_material, context_key.as_bytes(), DERIVE_KEY_MATERIAL)
.root_hash()
.0
}
@@ -909,9 +649,8 @@ pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
fn parent_node_output(
left_child: &CVBytes,
right_child: &CVBytes,
- key: &CVWords,
- flags: u8,
- platform: Platform,
+ key: &CVBytes,
+ flags: u32,
) -> Output {
let mut block = [0; BLOCK_LEN];
block[..32].copy_from_slice(left_child);
@@ -921,8 +660,7 @@ fn parent_node_output(
block,
block_len: BLOCK_LEN as u8,
counter: 0,
- flags: flags | PARENT,
- platform,
+ flags: (flags | PARENT) as u8,
}
}
@@ -963,7 +701,7 @@ fn parent_node_output(
#[derive(Clone)]
#[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
pub struct Hasher {
- key: CVWords,
+ key: CVBytes,
chunk_state: ChunkState,
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
@@ -974,26 +712,25 @@ pub struct Hasher {
}
impl Hasher {
- fn new_internal(key: &CVWords, flags: u8) -> Self {
+ fn new_internal(key: &CVBytes, flags: u32) -> Self {
Self {
key: *key,
- chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+ chunk_state: ChunkState::new(key, 0, flags),
cv_stack: ArrayVec::new(),
}
}
/// Construct a new `Hasher` for the regular hash function.
pub fn new() -> Self {
- Self::new_internal(IV, 0)
+ Self::new_internal(&IV_BYTES, 0)
}
/// Construct a new `Hasher` for the keyed hash function. See
/// [`keyed_hash`].
///
/// [`keyed_hash`]: fn.keyed_hash.html
- pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
- let key_words = platform::words_from_le_bytes_32(key);
- Self::new_internal(&key_words, KEYED_HASH)
+ pub fn new_keyed(key: &CVBytes) -> Self {
+ Self::new_internal(key, KEYED_HASH)
}
/// Construct a new `Hasher` for the key derivation function. See
@@ -1003,10 +740,9 @@ impl Hasher {
/// [`derive_key`]: fn.derive_key.html
pub fn new_derive_key(context: &str) -> Self {
let context_key =
- hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+ hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
.root_hash();
- let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
- Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
+ Self::new_internal(context_key.as_bytes(), DERIVE_KEY_MATERIAL)
}
/// Reset the `Hasher` to its initial state.
@@ -1014,12 +750,7 @@ impl Hasher {
/// This is functionally the same as overwriting the `Hasher` with a new
/// one, using the same key or context string if any.
pub fn reset(&mut self) -> &mut Self {
- self.chunk_state = ChunkState::new(
- &self.key,
- 0,
- self.chunk_state.flags,
- self.chunk_state.platform,
- );
+ self.chunk_state = ChunkState::new(&self.key, 0, self.chunk_state.flags as u32);
self.cv_stack.clear();
self
}
@@ -1044,8 +775,7 @@ impl Hasher {
&left_child,
&right_child,
&self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
);
self.cv_stack.push(parent_output.chaining_value());
}
@@ -1118,8 +848,7 @@ impl Hasher {
self.chunk_state = ChunkState::new(
&self.key,
self.chunk_state.chunk_counter + 1,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
);
} else {
return self;
@@ -1142,7 +871,7 @@ impl Hasher {
while input.len() > CHUNK_LEN {
debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
- let mut subtree_len = largest_power_of_two_leq(input.len());
+ let mut subtree_len = guts::largest_power_of_two_leq(input.len());
let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
// Shrink the subtree_len until it evenly divides the count so far.
// We know that subtree_len itself is a power of 2, so we can use a
@@ -1174,8 +903,7 @@ impl Hasher {
&ChunkState::new(
&self.key,
self.chunk_state.chunk_counter,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
)
.update(&input[..subtree_len])
.output()
@@ -1189,11 +917,10 @@ impl Hasher {
&input[..subtree_len],
&self.key,
self.chunk_state.chunk_counter,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
);
- let left_cv = array_ref!(cv_pair, 0, 32);
- let right_cv = array_ref!(cv_pair, 32, 32);
+ let left_cv = cv_pair[..32].try_into().unwrap();
+ let right_cv = cv_pair[32..].try_into().unwrap();
// Push the two CVs we received into the CV stack in order. Because
// the stack merges lazily, this guarantees we aren't merging the
// root.
@@ -1256,8 +983,7 @@ impl Hasher {
&self.cv_stack[num_cvs_remaining - 2],
&self.cv_stack[num_cvs_remaining - 1],
&self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
);
num_cvs_remaining -= 2;
}
@@ -1266,8 +992,7 @@ impl Hasher {
&self.cv_stack[num_cvs_remaining - 1],
&output.chaining_value(),
&self.key,
- self.chunk_state.flags,
- self.chunk_state.platform,
+ self.chunk_state.flags as u32,
);
num_cvs_remaining -= 1;
}
@@ -1481,7 +1206,6 @@ impl fmt::Debug for Hasher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Hasher")
.field("flags", &self.chunk_state.flags)
- .field("platform", &self.chunk_state.platform)
.finish()
}
}
@@ -1546,6 +1270,62 @@ impl OutputReader {
}
}
+ // There's some nontrivial logic here to handle partial blocks, and I don't want to copy-paste
+ // it between the xof and xof_xor cases.
+ #[inline(always)]
+ fn fill_inner(&mut self, mut buf: &mut [u8], xor: bool) {
+ debug_assert!(self.position_within_block < BLOCK_LEN as u8);
+ let xof_fn = if xor {
+ guts::Implementation::xof_xor
+ } else {
+ guts::Implementation::xof
+ };
+ if self.position_within_block != 0 {
+ // The xof() and xof_xor() APIs can handle a partial block at the end but not a partial
+ // block at the beginning. We handle the beginning case here. Start by computing the
+ // complete block that we need part of.
+ let mut partial_block = [0u8; 64];
+ xof_fn(
+ &guts::DETECTED_IMPL,
+ &self.inner.block,
+ self.inner.block_len as u32,
+ &self.inner.input_chaining_value,
+ self.inner.counter,
+ self.inner.flags as u32,
+ &mut partial_block,
+ );
+ let output_bytes = &partial_block[self.position_within_block as usize..];
+ let take = cmp::min(buf.len(), output_bytes.len());
+ if xor {
+ for byte_index in 0..take {
+ buf[byte_index] ^= output_bytes[byte_index];
+ }
+ } else {
+ buf[..take].copy_from_slice(&output_bytes[..take]);
+ }
+ buf = &mut buf[take..];
+ self.position_within_block += take as u8;
+ if self.position_within_block == BLOCK_LEN as u8 {
+ self.position_within_block = 0;
+ self.inner.counter += 1;
+ } else {
+ debug_assert!(buf.is_empty());
+ return;
+ }
+ }
+ xof_fn(
+ &guts::DETECTED_IMPL,
+ &self.inner.block,
+ self.inner.block_len as u32,
+ &self.inner.input_chaining_value,
+ self.inner.counter,
+ self.inner.flags as u32,
+ buf,
+ );
+ self.inner.counter += (buf.len() / BLOCK_LEN) as u64;
+ self.position_within_block = (buf.len() % BLOCK_LEN) as u8;
+ }
+
/// Fill a buffer with output bytes and advance the position of the
/// `OutputReader`. This is equivalent to [`Read::read`], except that it
/// doesn't return a `Result`. Both methods always fill the entire buffer.
@@ -1561,19 +1341,12 @@ impl OutputReader {
/// reading further, the behavior is unspecified.
///
/// [`Read::read`]: #method.read
- pub fn fill(&mut self, mut buf: &mut [u8]) {
- while !buf.is_empty() {
- let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
- let output_bytes = &block[self.position_within_block as usize..];
- let take = cmp::min(buf.len(), output_bytes.len());
- buf[..take].copy_from_slice(&output_bytes[..take]);
- buf = &mut buf[take..];
- self.position_within_block += take as u8;
- if self.position_within_block == BLOCK_LEN as u8 {
- self.inner.counter += 1;
- self.position_within_block = 0;
- }
- }
+ pub fn fill(&mut self, buf: &mut [u8]) {
+ self.fill_inner(buf, false);
+ }
+
+ pub fn fill_xor(&mut self, buf: &mut [u8]) {
+ self.fill_inner(buf, true);
}
/// Return the current read position in the output stream. This is
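
The headline addition in lib.rs is `fill_xor`, which XORs the XOF stream into the caller's buffer instead of overwriting it, sharing the partial-block bookkeeping with `fill` via `fill_inner`. One direct consequence, which test_xof_xor in src/test.rs below leans on, is that applying the same stream twice round-trips a buffer. A usage sketch with arbitrary key and input (all calls shown exist in this diff or the current public API):

// Usage sketch (ours): round-tripping a buffer with the new fill_xor.
fn xor_roundtrip_demo() {
    let mut msg = *b"hello world, this is 32 bytes!!!";
    let mut hasher = blake3::Hasher::new_keyed(&[42; 32]);
    hasher.update(b"per-message input goes here");
    let mut reader = hasher.finalize_xof();
    reader.fill_xor(&mut msg); // msg is now masked by the output stream
    reader.set_position(0);
    reader.fill_xor(&mut msg); // same stream again: msg is restored
    assert_eq!(&msg, b"hello world, this is 32 bytes!!!");
}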
diff --git a/src/portable.rs b/src/portable.rs
index 7af6828..b82bd7f 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -181,10 +181,10 @@ pub fn hash_many<const N: usize>(
pub mod test {
use super::*;
- // This is basically testing the portable implementation against itself,
- // but it also checks that compress_in_place and compress_xof are
- // consistent. And there are tests against the reference implementation and
- // against hardcoded test vectors elsewhere.
+ // These are basically testing the portable implementation against itself, but we also check
+ // that compress_in_place and compress_xof are consistent. And there are tests against the
+ // reference implementation and against hardcoded test vectors elsewhere.
+
#[test]
fn test_compress() {
crate::test::test_compress_fn(compress_in_place, compress_xof);
diff --git a/src/test.rs b/src/test.rs
index fb1e849..cfa1daa 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,6 +1,6 @@
-use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
+use blake3_guts as guts;
+use guts::{CVBytes, CVWords, BLOCK_LEN, CHUNK_LEN};
+
use core::usize;
use rand::prelude::*;
@@ -46,172 +46,12 @@ pub const TEST_CASES: &[usize] = &[
pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;
// There's a test to make sure these two are equal below.
-pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
-pub const TEST_KEY_WORDS: CVWords = [
- 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
-];
-
-// Paint the input with a repeating byte pattern. We use a cycle length of 251,
-// because that's the largest prime number less than 256. This makes it
-// unlikely to swapping any two adjacent input blocks or chunks will give the
-// same answer.
-pub fn paint_test_input(buf: &mut [u8]) {
- for (i, b) in buf.iter_mut().enumerate() {
- *b = (i % 251) as u8;
- }
-}
-
-type CompressInPlaceFn =
- unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
-
-type CompressXofFn = unsafe fn(
- cv: &CVWords,
- block: &[u8; BLOCK_LEN],
- block_len: u8,
- counter: u64,
- flags: u8,
-) -> [u8; 64];
-
-// A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
- let initial_state = TEST_KEY_WORDS;
- let block_len: u8 = 61;
- let mut block = [0; BLOCK_LEN];
- paint_test_input(&mut block[..block_len as usize]);
- // Use a counter with set bits in both 32-bit words.
- let counter = (5u64 << 32) + 6;
- let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
-
- let portable_out =
- crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
-
- let mut test_state = initial_state;
- unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
- let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
- let test_xof =
- unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
-
- assert_eq!(&portable_out[..32], &test_state_bytes[..]);
- assert_eq!(&portable_out[..], &test_xof[..]);
-}
-
-type HashManyFn<A> = unsafe fn(
- inputs: &[&A],
- key: &CVWords,
- counter: u64,
- increment_counter: IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-);
-
-// A shared helper function for platform-specific tests.
-pub fn test_hash_many_fn(
- hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>,
- hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>,
-) {
- // Test a few different initial counter values.
- // - 0: The base case.
- // - u32::MAX: The low word of the counter overflows for all inputs except the first.
- // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
- // when you're supposed to ANDNOT...
- let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
- for counter in initial_counters {
- #[cfg(feature = "std")]
- dbg!(counter);
-
- // 31 (16 + 8 + 4 + 2 + 1) inputs
- const NUM_INPUTS: usize = 31;
- let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
- crate::test::paint_test_input(&mut input_buf);
-
- // First hash chunks.
- let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new();
- for i in 0..NUM_INPUTS {
- chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
- }
- let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- crate::portable::hash_many(
- &chunks,
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::Yes,
- crate::KEYED_HASH,
- crate::CHUNK_START,
- crate::CHUNK_END,
- &mut portable_chunks_out,
- );
-
- let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_chunks_fn(
- &chunks[..],
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::Yes,
- crate::KEYED_HASH,
- crate::CHUNK_START,
- crate::CHUNK_END,
- &mut test_chunks_out,
- );
- }
- for n in 0..NUM_INPUTS {
- #[cfg(feature = "std")]
- dbg!(n);
- assert_eq!(
- &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
- &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
-
- // Then hash parents.
- let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new();
- for i in 0..NUM_INPUTS {
- parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
- }
- let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
- crate::portable::hash_many(
- &parents,
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::No,
- crate::KEYED_HASH | crate::PARENT,
- 0,
- 0,
- &mut portable_parents_out,
- );
-
- let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
- unsafe {
- hash_many_parents_fn(
- &parents[..],
- &TEST_KEY_WORDS,
- counter,
- IncrementCounter::No,
- crate::KEYED_HASH | crate::PARENT,
- 0,
- 0,
- &mut test_parents_out,
- );
- }
- for n in 0..NUM_INPUTS {
- #[cfg(feature = "std")]
- dbg!(n);
- assert_eq!(
- &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
- &test_parents_out[n * OUT_LEN..][..OUT_LEN]
- );
- }
- }
-}
+pub const TEST_KEY: &CVBytes = b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: &CVWords = &guts::words_from_le_bytes_32(TEST_KEY);
#[test]
fn test_key_bytes_equal_key_words() {
- assert_eq!(
- TEST_KEY_WORDS,
- crate::platform::words_from_le_bytes_32(&TEST_KEY),
- );
+ assert_eq!(TEST_KEY, &guts::le_bytes_from_words_32(TEST_KEY_WORDS),);
}
#[test]
@@ -224,52 +64,9 @@ fn test_reference_impl_size() {
assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>());
}
-#[test]
-fn test_counter_words() {
- let counter: u64 = (1 << 32) + 2;
- assert_eq!(crate::counter_low(counter), 2);
- assert_eq!(crate::counter_high(counter), 1);
-}
-
-#[test]
-fn test_largest_power_of_two_leq() {
- let input_output = &[
- // The zero case is nonsensical, but it does work.
- (0, 1),
- (1, 1),
- (2, 2),
- (3, 2),
- (4, 4),
- (5, 4),
- (6, 4),
- (7, 4),
- (8, 8),
- // the largest possible usize
- (usize::MAX, (usize::MAX >> 1) + 1),
- ];
- for &(input, output) in input_output {
- assert_eq!(
- output,
- crate::largest_power_of_two_leq(input),
- "wrong output for n={}",
- input
- );
- }
-}
-
-#[test]
-fn test_left_len() {
- let input_output = &[
- (CHUNK_LEN + 1, CHUNK_LEN),
- (2 * CHUNK_LEN - 1, CHUNK_LEN),
- (2 * CHUNK_LEN, CHUNK_LEN),
- (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN, 2 * CHUNK_LEN),
- (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
- ];
- for &(input, output) in input_output {
- assert_eq!(crate::left_len(input), output);
+pub(crate) fn paint_test_input(buf: &mut [u8]) {
+ for (i, b) in buf.iter_mut().enumerate() {
+ *b = (i % 251) as u8;
}
}
@@ -292,18 +89,18 @@ fn test_compare_reference_impl() {
// all at once
let test_out = crate::hash(input);
- assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ assert_eq!(test_out, expected_out[..32]);
// incremental
let mut hasher = crate::Hasher::new();
hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
assert_eq!(hasher.finalize(), test_out);
// incremental (rayon)
#[cfg(feature = "rayon")]
{
let mut hasher = crate::Hasher::new();
hasher.update_rayon(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
assert_eq!(hasher.finalize(), test_out);
}
// xof
@@ -314,25 +111,25 @@ fn test_compare_reference_impl() {
// keyed
{
- let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
+ let mut reference_hasher = reference_impl::Hasher::new_keyed(TEST_KEY);
reference_hasher.update(input);
let mut expected_out = [0; OUT];
reference_hasher.finalize(&mut expected_out);
// all at once
- let test_out = crate::keyed_hash(&TEST_KEY, input);
- assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+ let test_out = crate::keyed_hash(TEST_KEY, input);
+ assert_eq!(test_out, expected_out[..32]);
// incremental
- let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+ let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
assert_eq!(hasher.finalize(), test_out);
// incremental (rayon)
#[cfg(feature = "rayon")]
{
- let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+ let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
hasher.update_rayon(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
assert_eq!(hasher.finalize(), test_out);
}
// xof
@@ -355,15 +152,15 @@ fn test_compare_reference_impl() {
// incremental
let mut hasher = crate::Hasher::new_derive_key(context);
hasher.update(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
- assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
+ assert_eq!(hasher.finalize(), test_out[..32]);
// incremental (rayon)
#[cfg(feature = "rayon")]
{
let mut hasher = crate::Hasher::new_derive_key(context);
hasher.update_rayon(input);
- assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
- assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+ assert_eq!(hasher.finalize(), expected_out[..32]);
+ assert_eq!(hasher.finalize(), test_out[..32]);
}
// xof
let mut extended = [0; OUT];
@@ -423,9 +220,9 @@ fn test_fuzz_hasher() {
let mut input_buf = [0; 3 * INPUT_MAX];
paint_test_input(&mut input_buf);
- // Don't do too many iterations in debug mode, to keep the tests under a
- // second or so. CI should run tests in release mode also. Provide an
- // environment variable for specifying a larger number of fuzz iterations.
+ // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+ // should run tests in release mode also.
+ // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 };
// Use a fixed RNG seed for reproducibility.
@@ -494,6 +291,133 @@ fn test_xof_seek() {
}
#[test]
+fn test_xof_xor() {
+ for step in [32, 63, 64, 128, 303] {
+ #[cfg(feature = "std")]
+ dbg!(step);
+ let mut ref_hasher = reference_impl::Hasher::new();
+ ref_hasher.update(b"foo");
+ let mut ref_output = [0u8; 1000];
+ ref_hasher.finalize(&mut ref_output);
+
+ let mut hasher = crate::Hasher::new();
+ hasher.update(b"foo");
+ let mut reader = hasher.finalize_xof();
+
+ let mut test_output = [0u8; 1000];
+ for chunk in test_output.chunks_mut(step) {
+ reader.fill(chunk);
+ }
+ assert_eq!(ref_output, test_output);
+ // Xor'ing the same output should zero the buffer.
+ reader.set_position(0);
+ for chunk in test_output.chunks_mut(step) {
+ reader.fill_xor(chunk);
+ }
+ assert_eq!([0u8; 1000], test_output);
+ // Xor'ing the same output again should reproduce the original.
+ reader.set_position(0);
+ for chunk in test_output.chunks_mut(step) {
+ reader.fill_xor(chunk);
+ }
+ assert_eq!(ref_output, test_output);
+
+ // Repeat the same test but starting at offset 500.
+ reader.set_position(500);
+ for chunk in test_output[..500].chunks_mut(step) {
+ reader.fill(chunk);
+ }
+ assert_eq!(ref_output[500..], test_output[..500]);
+ reader.set_position(500);
+ for chunk in test_output[..500].chunks_mut(step) {
+ reader.fill_xor(chunk);
+ }
+ assert_eq!([0u8; 500], test_output[..500]);
+ reader.set_position(500);
+ for chunk in test_output[..500].chunks_mut(step) {
+ reader.fill_xor(chunk);
+ }
+ assert_eq!(ref_output[500..], test_output[..500]);
+ }
+}
+
+#[test]
+#[cfg(feature = "std")]
+fn test_fuzz_xof() {
+ // Use a fixed RNG seed for reproducibility.
+ let mut rng = rand_chacha::ChaCha8Rng::from_seed([99; 32]);
+ let random_key: [u8; 32] = rng.gen();
+
+    let possible_seeks = [-64i64, -63, -1, 0, 1, 63, 64, 127, 128, 129];
+
+ const MAX_LEN: usize = 1100;
+ let possible_lengths = [0usize, 1, 63, 64, 65, 128, 256, 512, 1024, MAX_LEN];
+ assert!(possible_lengths.into_iter().all(|x| x <= MAX_LEN));
+
+ let mut xof_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+ let mut xof_xor_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+
+ // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+ // should run tests in release mode also.
+ // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
+ let num_tests = if cfg!(debug_assertions) {
+ 1_000
+ } else {
+ 100_000
+ };
+
+ let mut position = 0;
+ let mut ref_output = Vec::new();
+ for test_i in 0..num_tests {
+ eprintln!("--- test {test_i} ---");
+ // Do a random relative seek maybe. Could be zero.
+ let relative_seek: i64 = *possible_seeks.choose(&mut rng).unwrap();
+ dbg!(relative_seek);
+ if relative_seek != 0 {
+ let new_position = position as i64 + relative_seek;
+ if 0 <= new_position && new_position <= MAX_LEN as i64 {
+ position = new_position as u64;
+ } else {
+ position = 0;
+ }
+ assert_eq!(xof_output.position(), xof_xor_output.position());
+ xof_output.set_position(position as u64);
+ xof_xor_output.set_position(position as u64);
+ }
+ dbg!(position);
+
+ // Generate a random number of output bytes. If the amount of output we've gotten from the
+ // reference_impl isn't enough, double it.
+ let len: usize = *possible_lengths.choose(&mut rng).unwrap();
+ dbg!(len);
+ if position as usize + len > ref_output.len() {
+ let new_len = core::cmp::max(MAX_LEN, 2 * ref_output.len());
+ ref_output = vec![0u8; new_len];
+ eprintln!("grow reference output length to {}", ref_output.len());
+ let ref_hasher = reference_impl::Hasher::new_keyed(&random_key);
+ ref_hasher.finalize(&mut ref_output);
+ }
+ let mut buf = [0u8; MAX_LEN];
+ xof_output.fill(&mut buf[..len]);
+ assert_eq!(ref_output[position as usize..][..len], buf[..len]);
+ assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+ // Xor over the output with a random byte value, and then confirm that xof_xor() recovers
+ // that value.
+ let random_byte: u8 = rng.gen();
+ dbg!(random_byte);
+ for i in 0..len {
+ buf[i] ^= random_byte;
+ }
+ xof_xor_output.fill_xor(&mut buf[..len]);
+ assert_eq!([random_byte; MAX_LEN][..len], buf[..len]);
+ assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+ position += len as u64;
+ }
+}
+
+#[test]
fn test_msg_schedule_permutation() {
let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
@@ -506,7 +430,7 @@ fn test_msg_schedule_permutation() {
}
}
- assert_eq!(generated, crate::MSG_SCHEDULE);
+ assert_eq!(generated, guts::MSG_SCHEDULE);
}
#[test]
@@ -640,53 +564,43 @@ fn test_zeroize() {
let mut hasher = crate::Hasher {
chunk_state: crate::ChunkState {
- cv: [42; 8],
+ cv: [42; 32],
chunk_counter: 42,
buf: [42; 64],
buf_len: 42,
blocks_compressed: 42,
flags: 42,
- platform: crate::Platform::Portable,
},
- key: [42; 8],
+ key: [42; 32],
cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(),
};
hasher.zeroize();
- assert_eq!(hasher.chunk_state.cv, [0; 8]);
+ assert_eq!(hasher.chunk_state.cv, [0; 32]);
assert_eq!(hasher.chunk_state.chunk_counter, 0);
assert_eq!(hasher.chunk_state.buf, [0; 64]);
assert_eq!(hasher.chunk_state.buf_len, 0);
assert_eq!(hasher.chunk_state.blocks_compressed, 0);
assert_eq!(hasher.chunk_state.flags, 0);
- assert!(matches!(
- hasher.chunk_state.platform,
- crate::Platform::Portable
- ));
- assert_eq!(hasher.key, [0; 8]);
+ assert_eq!(hasher.key, [0; 32]);
assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);
let mut output_reader = crate::OutputReader {
inner: crate::Output {
- input_chaining_value: [42; 8],
+ input_chaining_value: [42; 32],
block: [42; 64],
counter: 42,
block_len: 42,
flags: 42,
- platform: crate::Platform::Portable,
},
position_within_block: 42,
};
output_reader.zeroize();
- assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
+ assert_eq!(output_reader.inner.input_chaining_value, [0; 32]);
assert_eq!(output_reader.inner.block, [0; 64]);
assert_eq!(output_reader.inner.counter, 0);
assert_eq!(output_reader.inner.block_len, 0);
assert_eq!(output_reader.inner.flags, 0);
- assert!(matches!(
- output_reader.inner.platform,
- crate::Platform::Portable
- ));
assert_eq!(output_reader.position_within_block, 0);
}
@@ -732,7 +646,7 @@ fn test_update_reader_interrupted() -> std::io::Result<()> {
self.already_interrupted = true;
return Err(io::Error::from(io::ErrorKind::Interrupted));
}
- let take = std::cmp::min(self.slice.len(), buf.len());
+ let take = core::cmp::min(self.slice.len(), buf.len());
buf[..take].copy_from_slice(&self.slice[..take]);
self.slice = &self.slice[take..];
Ok(take)
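
A detail worth noting in the test changes above: `TEST_KEY_WORDS` is now computed at compile time from `TEST_KEY`, which requires the byte/word conversion helpers in `blake3_guts` to be `const fn` (the const item at the top of the file wouldn't compile otherwise). A minimal sketch of what such a helper can look like, ours for illustration rather than the guts crate's actual code:

// Sketch: a const-friendly little-endian bytes-to-words conversion.
// A `while` loop instead of `for`, because iterators aren't usable in const fn.
const fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut words = [0u32; 8];
    let mut i = 0;
    while i < 8 {
        words[i] = u32::from_le_bytes([
            bytes[4 * i],
            bytes[4 * i + 1],
            bytes[4 * i + 2],
            bytes[4 * i + 3],
        ]);
        i += 1;
    }
    words
}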
diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml
index 87a9eba..21098d4 100644
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@@ -12,6 +12,7 @@ pure = ["blake3/pure"]
# If you ever change these path dependencies, you'll probably need to update
# cross_test.sh, or CI will break. I'm sorry >.<
blake3 = { path = "../" }
+blake3_guts = { path = "../rust/guts" }
hex = "0.4.0"
reference_impl = { path = "../reference_impl" }
serde = { version = "1.0", features = ["derive"] }
diff --git a/test_vectors/cross_test.sh b/test_vectors/cross_test.sh
index c4d280c..a08178a 100755
--- a/test_vectors/cross_test.sh
+++ b/test_vectors/cross_test.sh
@@ -20,6 +20,7 @@ mv blake3/reference_impl test_vectors
mv blake3 test_vectors
cd test_vectors
sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
-sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
+sed -i 's|blake3_guts = { path = "../rust/guts" }|blake3_guts = { path = "./blake3/rust/guts" }|' Cargo.toml
+sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "./reference_impl" }|' Cargo.toml
cross test "$@"
diff --git a/test_vectors/src/lib.rs b/test_vectors/src/lib.rs
index 6a4c798..ea2491a 100644
--- a/test_vectors/src/lib.rs
+++ b/test_vectors/src/lib.rs
@@ -1,4 +1,4 @@
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
+use blake3_guts::{BLOCK_LEN, CHUNK_LEN};
use serde::{Deserialize, Serialize};
// A non-multiple of 4 is important, since one possible bug is to fail to emit