aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJack O'Connor <[email protected]>2020-02-11 14:13:30 -0500
committerJack O'Connor <[email protected]>2020-02-12 10:23:17 -0500
commitefbfa0463c793dc1319db10ca4e3b809937b227d (patch)
treeb643427eb38da8dc9b6548814e7e34966b604791
parentb6b3c27824e665a73f77fd147da2052efff0ab8a (diff)
integrate assembly implementations into the blake3 crate
-rw-r--r--.github/workflows/ci.yml26
-rw-r--r--Cargo.toml19
-rw-r--r--README.md20
-rw-r--r--b3sum/Cargo.toml4
-rw-r--r--benches/bench.rs162
-rw-r--r--build.rs107
-rw-r--r--src/c_avx2.rs63
-rw-r--r--src/c_avx512.rs3
-rw-r--r--src/c_neon.rs2
-rw-r--r--src/c_sse41.rs114
-rw-r--r--src/lib.rs38
-rw-r--r--src/platform.rs89
-rw-r--r--src/rust_avx2.rs (renamed from src/avx2.rs)0
-rw-r--r--src/rust_sse41.rs (renamed from src/sse41.rs)0
-rw-r--r--test_vectors/Cargo.toml8
-rwxr-xr-xtest_vectors/cross_test.sh2
16 files changed, 465 insertions, 192 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e3da4e5..db7decd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,22 +24,30 @@ jobs:
toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }}
profile: minimal
override: true
- # Default tests.
- - run: cargo test
- # No-default-features tests.
+ # Default tests plus Rayon.
+ - run: cargo test --features=rayon
+ # no_std tests.
- run: cargo test --no-default-features
- # More features tests. Note that "c_avx512" participates in dynamic feature
- # detection, so it'll be built, but it probably won't run.
- - run: cargo test --features=c_avx512,rayon
+ # Test the x86 assembly implementations. Use -vv to log compiler commands.
+ - run: cargo test --features=c -vv
+ # Test the C intrinsics implementations. Use -vv to log compiler commands.
+ - run: cargo test --features=c,c_prefer_intrinsics -vv
# Test release mode. This does more iterations in test_fuzz_hasher.
- run: cargo test --release
- # Test benchmarks. Nightly only.
- - run: cargo test --benches
- if: matrix.rust_version == 'nightly'
+ # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains.
+ - run: cargo test --benches --features=c
+ env:
+ RUSTC_BOOTSTRAP: 1
# Test vectors.
- name: test vectors
run: cargo test
working-directory: ./test_vectors
+ # Test vectors again with the "c" feature enabled. The step name is distinct
+ # from the default-features run so the steps can be told apart in CI logs.
+ - name: test vectors (c)
+ run: cargo test --features=c
+ working-directory: ./test_vectors
+ # Test vectors once more with the C intrinsics implementations selected.
+ - name: test vectors (c, c_prefer_intrinsics)
+ run: cargo test --features=c,c_prefer_intrinsics
+ working-directory: ./test_vectors
# Test b3sum.
- name: test b3sum
run: cargo test
diff --git a/Cargo.toml b/Cargo.toml
index 4d8e7cf..1a659ef 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,10 +11,21 @@ edition = "2018"
[features]
default = ["std"]
-# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU
-# feature detection. A binary with "c_avx512" on is still cross-platform. This
-# feature has no effect on non-x86.
-c_avx512 = []
+# The "c" feature includes C and assembly SIMD implementations of the
+# compression function for x86 platforms, called via FFI. (Currently it has no
+# effect on other platforms.) This requires a C toolchain on the build machine.
+# This is necessary for AVX-512 support, which is not yet stable in Rust, and
+# the assembly implementations also perform better than those using Rust/LLVM
+# intrinsics. As with the Rust implementations, these C and assembly
+# implementations participate in runtime CPU feature detection, and the
+# resulting binary is portable.
+c = []
+# Normally x86-64 builds prefer assembly implementations over C intrinsics. The
+# assembly implementations perform better, perform most consistently across
+# compilers, and are much faster to build. However, this feature makes the
+# build use the C intrinsics implementations instead. This is mainly for
+# testing purposes, and most callers will not want to use it.
+c_prefer_intrinsics = []
# The NEON implementation does not participate in dynamic feature detection,
# which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note
# that AArch64 always supports NEON, but support on ARMv7 varies.
diff --git a/README.md b/README.md
index 8f881dd..a8ad4c7 100644
--- a/README.md
+++ b/README.md
@@ -33,19 +33,18 @@ with BLAKE3.
This repository is the official implementation of BLAKE3. It includes:
* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
- includes optimized SIMD implementations, with dynamic CPU feature
- detection on x86. SSE4.1 and AVX2 support are implemented in Rust,
- while AVX-512 and ARM NEON support are imported from the C
- implementation and controlled by the `c_avx512` and `c_neon` features.
- Multi-threading is implemented with
- [Rayon](https://github.com/rayon-rs/rayon) and controlled by the
- `rayon` feature.
+ includes optimized SIMD implementations, with runtime CPU feature
+ detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c`
+ feature enables C/assembly implementations and AVX-512 support. The
+ `c_neon` feature enables ARM NEON support. Multi-threading is also
+ supported, and the `rayon` feature provides a
+ [Rayon](https://github.com/rayon-rs/rayon)-based implementation.
* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
provides a command line interface. You can install it from
[crates.io](https://crates.io/crates/b3sum) with `cargo install
- b3sum`. It enables the multi-threading and AVX-512 features of the
- `blake3` crate by default.
+ b3sum`. It enables the `rayon` and `c` features of the `blake3` crate
+ by default.
* The [C implementation](c), which like the Rust implementation includes
SIMD code and dynamic CPU feature detection on x86. Unlike the Rust
@@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
## Usage
-This repository provides the `b3sum` command line utility and the
-`blake3` Rust crate.
-
### The `b3sum` utility
The `b3sum` utility allows you to process files and data from standard
diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml
index c4c8068..aaa23e9 100644
--- a/b3sum/Cargo.toml
+++ b/b3sum/Cargo.toml
@@ -9,8 +9,8 @@ readme = "README.md"
edition = "2018"
[features]
-default = ["c_avx512", "rayon"]
-c_avx512 = ["blake3/c_avx512"]
+default = ["c", "rayon"]
+c = ["blake3/c"]
c_neon = ["blake3/c_neon"]
rayon = ["blake3/rayon", "memmap"]
diff --git a/benches/bench.rs b/benches/bench.rs
index 0d73970..70be967 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -4,7 +4,7 @@ extern crate test;
use arrayref::array_ref;
use arrayvec::ArrayVec;
-use blake3::platform::MAX_SIMD_DEGREE;
+use blake3::platform::{Platform, MAX_SIMD_DEGREE};
use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN};
use rand::prelude::*;
use test::Bencher;
@@ -48,173 +48,149 @@ impl RandomInput {
}
}
-type CompressInPlaceFn =
- unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
-
-fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) {
+fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
let mut state = [1u32; 8];
let mut r = RandomInput::new(b, 64);
let input = array_ref!(r.get(), 0, 64);
- unsafe {
- b.iter(|| f(&mut state, input, 64 as u8, 0, 0));
- }
+ b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
}
#[bench]
fn bench_single_compression_portable(b: &mut Bencher) {
- bench_single_compression_fn(b, blake3::portable::compress_in_place);
+ bench_single_compression_fn(b, Platform::portable());
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) {
- if !blake3::platform::sse41_detected() {
- return;
+ if let Some(platform) = Platform::sse41() {
+ bench_single_compression_fn(b, platform);
}
- bench_single_compression_fn(b, blake3::sse41::compress_in_place);
}
#[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
fn bench_single_compression_avx512(b: &mut Bencher) {
- if !blake3::platform::avx512_detected() {
- return;
+ if let Some(platform) = Platform::avx512() {
+ bench_single_compression_fn(b, platform);
}
- bench_single_compression_fn(b, blake3::c_avx512::compress_in_place);
}
-type HashManyFn<A> = unsafe fn(
- inputs: &[&A],
- key: &[u32; 8],
- counter: u64,
- increment_counter: blake3::IncrementCounter,
- flags: u8,
- flags_start: u8,
- flags_end: u8,
- out: &mut [u8],
-);
-
-fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) {
+fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
+ let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, CHUNK_LEN));
}
- unsafe {
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- f(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::Yes,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
- }
+ b.iter(|| {
+ let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs
+ .iter_mut()
+ .take(degree)
+ .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
+ .collect();
+ let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
+ platform.hash_many(
+ &input_arrays[..],
+ &[0; 8],
+ 0,
+ blake3::IncrementCounter::Yes,
+ 0,
+ 0,
+ 0,
+ &mut out,
+ );
+ });
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) {
- if !blake3::platform::sse41_detected() {
- return;
+ if let Some(platform) = Platform::sse41() {
+ bench_many_chunks_fn(b, platform);
}
- bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx2(b: &mut Bencher) {
- if !blake3::platform::avx2_detected() {
- return;
+ if let Some(platform) = Platform::avx2() {
+ bench_many_chunks_fn(b, platform);
}
- bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
}
#[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
fn bench_many_chunks_avx512(b: &mut Bencher) {
- if !blake3::platform::avx512_detected() {
- return;
+ if let Some(platform) = Platform::avx512() {
+ bench_many_chunks_fn(b, platform);
}
- bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
}
#[bench]
#[cfg(feature = "c_neon")]
fn bench_many_chunks_neon(b: &mut Bencher) {
- // When "c_neon" is on, NEON support is assumed.
- bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
+ if let Some(platform) = Platform::neon() {
+ bench_many_chunks_fn(b, platform);
+ }
}
// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) {
+fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
+ let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, BLOCK_LEN));
}
- unsafe {
- b.iter(|| {
- let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
- .iter_mut()
- .take(degree)
- .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
- .collect();
- let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
- f(
- &input_arrays[..],
- &[0; 8],
- 0,
- blake3::IncrementCounter::No,
- 0,
- 0,
- 0,
- &mut out,
- );
- });
- }
+ b.iter(|| {
+ let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs
+ .iter_mut()
+ .take(degree)
+ .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
+ .collect();
+ let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
+ platform.hash_many(
+ &input_arrays[..],
+ &[0; 8],
+ 0,
+ blake3::IncrementCounter::No,
+ 0,
+ 0,
+ 0,
+ &mut out,
+ );
+ });
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) {
- if !blake3::platform::sse41_detected() {
- return;
+ if let Some(platform) = Platform::sse41() {
+ bench_many_parents_fn(b, platform);
}
- bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE);
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx2(b: &mut Bencher) {
- if !blake3::platform::avx2_detected() {
- return;
+ if let Some(platform) = Platform::avx2() {
+ bench_many_parents_fn(b, platform);
}
- bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE);
}
#[bench]
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
fn bench_many_parents_avx512(b: &mut Bencher) {
- if !blake3::platform::avx512_detected() {
- return;
+ if let Some(platform) = Platform::avx512() {
+ bench_many_parents_fn(b, platform);
}
- bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE);
}
#[bench]
#[cfg(feature = "c_neon")]
fn bench_many_parents_neon(b: &mut Bencher) {
- // When "c_neon" is on, NEON support is assumed.
- bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE);
+ if let Some(platform) = Platform::neon() {
+ bench_many_parents_fn(b, platform);
+ }
}
fn bench_atonce(b: &mut Bencher, len: usize) {
diff --git a/build.rs b/build.rs
index 67fe3fc..c5a662d 100644
--- a/build.rs
+++ b/build.rs
@@ -13,6 +13,11 @@ fn is_x86_64() -> bool {
target_components()[0] == "x86_64"
}
+/// Reports whether the first component of the build target is one of the
+/// 32-bit x86 architecture names: `i386`, `i586` or `i686`.
+fn is_x86_32() -> bool {
+    let components = target_components();
+    ["i386", "i586", "i686"]
+        .iter()
+        .any(|name| components[0] == *name)
+}
+
fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
@@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool {
&& target_components()[3] == "msvc"
}
+/// Reports whether the build target has the form `*-pc-windows-gnu`.
+fn is_windows_gnu() -> bool {
+    // Some targets are shorter than four components, so look at one
+    // component at a time and bail out as soon as one does not match. That
+    // way the later indexes are only touched when the earlier ones matched.
+    if target_components()[1] != "pc" {
+        return false;
+    }
+    if target_components()[2] != "windows" {
+        return false;
+    }
+    target_components()[3] == "gnu"
+}
+
fn new_build() -> cc::Build {
let mut build = cc::Build::new();
if !is_windows_msvc() {
@@ -37,16 +49,16 @@ fn new_build() -> cc::Build {
}
const WINDOWS_MSVC_ERROR: &str = r#"
-The "c_avx512" feature is enabled, but your version of the MSVC C compiler does
-not support the "/arch:AVX512" flag. If you are building the "b3sum" or
-"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features"
-flag. (Note that this also disables other default features like Rayon-based
+The "c" feature is enabled, but your version of the MSVC C compiler does not
+support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin"
+crates, you can disable AVX-512 with Cargo's "--no-default-features" flag.
+(Note that this also disables other default features like Rayon-based
multithreading, which you can re-enable with "--features=rayon".) Other crates
might or might not support this workaround.
"#;
const GNU_ERROR: &str = r#"
-The "c_avx512" feature is enabled, but your C compiler does not support the
+The "c" feature is enabled, but your C compiler does not support the
"-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can
disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also
disables other default features like Rayon-based multithreading, which you can
@@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) {
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
- // "c_avx512' is a no-op for non-x86_64 targets. It also participates in
- // dynamic CPU feature detection, so it's generally safe to enable.
- // However, it probably won't build in some older environments without
- // AVX-512 support in the C compiler, and it's disabled by default for that
- // reason.
- if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() {
- let mut build = new_build();
- check_for_avx512_compiler_support(&build);
- build.file("c/blake3_avx512.c");
- if is_windows_msvc() {
- // Note that a lot of versions of MSVC don't support /arch:AVX512,
- // and they'll discard it with a warning, hopefully leading to a
- // build error.
- build.flag("/arch:AVX512");
+ if defined("CARGO_FEATURE_C") {
+ if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") {
+ // On 64-bit, use the assembly implementations, unless the
+ // "c_prefer_intrinsics" feature is enabled.
+ if is_windows_msvc() {
+ let mut build = new_build();
+ build.file("c/blake3-sse41-x86_64-windows-msvc.asm");
+ build.file("c/blake3-avx2-x86_64-windows-msvc.asm");
+ build.file("c/blake3-avx512-x86_64-windows-msvc.asm");
+ build.compile("blake3_asm");
+ } else if is_windows_gnu() {
+ let mut build = new_build();
+ build.file("c/blake3-sse41-x86_64-windows-gnu.S");
+ build.file("c/blake3-avx2-x86_64-windows-gnu.S");
+ build.file("c/blake3-avx512-x86_64-windows-gnu.S");
+ build.compile("blake3_asm");
+ } else {
+ // All non-Windows implementations are assumed to support
+ // Linux-style assembly. These files do contain a small
+ // explicit workaround for macOS also.
+ let mut build = new_build();
+ build.file("c/blake3-sse41-x86_64-unix.S");
+ build.file("c/blake3-avx2-x86_64-unix.S");
+ build.file("c/blake3-avx512-x86_64-unix.S");
+ build.compile("blake3_asm");
+ }
+ } else if is_x86_64() || is_x86_32() {
+ // Assembly implementations are only for 64-bit. On 32-bit, or if
+ // the "c_prefer_intrinsics" feature is enabled, use the
+ // intrinsics-based C implementations. These each need to be
+ // compiled separately, with the corresponding instruction set
+ // extension explicitly enabled in the compiler.
+
+ let mut sse41_build = new_build();
+ sse41_build.file("c/blake3_sse41.c");
+ if is_windows_msvc() {
+ // /arch:SSE2 is the default on x86 and undefined on x86_64:
+ // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86
+ // It also includes SSE4.1 intrinsics:
+ // https://stackoverflow.com/a/32183222/823869
+ } else {
+ sse41_build.flag("-msse4.1");
+ }
+ sse41_build.compile("blake3_sse41");
+
+ let mut avx2_build = new_build();
+ avx2_build.file("c/blake3_avx2.c");
+ if is_windows_msvc() {
+ avx2_build.flag("/arch:AVX2");
+ } else {
+ avx2_build.flag("-mavx2");
+ }
+ avx2_build.compile("blake3_avx2");
+
+ let mut avx512_build = new_build();
+ check_for_avx512_compiler_support(&avx512_build);
+ avx512_build.file("c/blake3_avx512.c");
+ if is_windows_msvc() {
+ // Note that a lot of versions of MSVC don't support /arch:AVX512,
+ // and they'll discard it with a warning, hopefully leading to a
+ // build error.
+ avx512_build.flag("/arch:AVX512");
+ } else {
+ avx512_build.flag("-mavx512f");
+ avx512_build.flag("-mavx512vl");
+ }
+ avx512_build.compile("blake3_avx512");
} else {
- build.flag("-mavx512f");
- build.flag("-mavx512vl");
+ // Currently no effect for non-x86 platforms.
}
- build.compile("blake3_avx512");
}
if defined("CARGO_FEATURE_C_NEON") {
diff --git a/src/c_avx2.rs b/src/c_avx2.rs
new file mode 100644
index 0000000..d805e86
--- /dev/null
+++ b/src/c_avx2.rs
@@ -0,0 +1,63 @@
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
+
+// Note that there is no AVX2 implementation of compress_in_place or
+// compress_xof.
+
+/// Forwards a batch of inputs to the external `blake3_hash_many_avx2`
+/// routine.
+///
+/// # Safety
+///
+/// This may only be called on platforms supporting AVX2.
+///
+/// # Panics
+///
+/// Panics if `out` holds fewer than `inputs.len() * OUT_LEN` bytes.
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
+    inputs: &[&A],
+    key: &CVWords,
+    counter: u64,
+    increment_counter: IncrementCounter,
+    flags: u8,
+    flags_start: u8,
+    flags_end: u8,
+    out: &mut [u8],
+) {
+    let num_inputs = inputs.len();
+    // Unlike the Rust hash_many implementations, the C side does no bounds
+    // checking on `out`. Check here, even though this function is unsafe.
+    assert!(out.len() >= num_inputs * OUT_LEN);
+    let input_ptrs = inputs.as_ptr() as *const *const u8;
+    let blocks_per_input = A::CAPACITY / BLOCK_LEN;
+    ffi::blake3_hash_many_avx2(
+        input_ptrs,
+        num_inputs,
+        blocks_per_input,
+        key.as_ptr(),
+        counter,
+        increment_counter.yes(),
+        flags,
+        flags_start,
+        flags_end,
+        out.as_mut_ptr(),
+    )
+}
+
+// Raw declaration of the external C-ABI symbol that `hash_many` in this file
+// calls. NOTE(review): presumably defined by the C/assembly sources that
+// build.rs compiles when the "c" feature is on -- confirm against build.rs.
+pub mod ffi {
+ extern "C" {
+ // The arguments are passed by `hash_many` as raw pointers and lengths;
+ // nothing on the Rust side checks them beyond the `out` length assert
+ // in the wrapper.
+ pub fn blake3_hash_many_avx2(
+ inputs: *const *const u8,
+ num_inputs: usize,
+ blocks: usize,
+ key: *const u32,
+ counter: u64,
+ increment_counter: bool,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ }
+}
+
+// Unit tests for the FFI-backed AVX2 wrapper.
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_hash_many() {
+ // `hash_many` may only be called when AVX2 is available, so the test
+ // silently does nothing on CPUs where it is not detected.
+ if !crate::platform::avx2_detected() {
+ return;
+ }
+ // The same function is passed for both arguments of the shared helper.
+ crate::test::test_hash_many_fn(hash_many, hash_many);
+ }
+}
diff --git a/src/c_avx512.rs b/src/c_avx512.rs
index f20de2c..c1b9f64 100644
--- a/src/c_avx512.rs
+++ b/src/c_avx512.rs
@@ -1,7 +1,5 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-pub const DEGREE: usize = 16;
-
// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_in_place(
cv: &mut CVWords,
@@ -91,7 +89,6 @@ pub mod ffi {
flags_end: u8,
out: *mut u8,
);
-
}
}
diff --git a/src/c_neon.rs b/src/c_neon.rs
index 34ef074..77b9654 100644
--- a/src/c_neon.rs
+++ b/src/c_neon.rs
@@ -1,7 +1,5 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
-pub const DEGREE: usize = 4;
-
// Unsafe because this may only be called on platforms supporting NEON.
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
diff --git a/src/c_sse41.rs b/src/c_sse41.rs
new file mode 100644
index 0000000..0b64c90
--- /dev/null
+++ b/src/c_sse41.rs
@@ -0,0 +1,114 @@
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
+
+/// Forwards one block to the external `blake3_compress_in_place_sse41`
+/// routine, which receives `cv` as a mutable pointer.
+///
+/// # Safety
+///
+/// This may only be called on platforms supporting SSE4.1.
+pub unsafe fn compress_in_place(
+    cv: &mut CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    counter: u64,
+    flags: u8,
+) {
+    let cv_ptr = cv.as_mut_ptr();
+    let block_ptr = block.as_ptr();
+    ffi::blake3_compress_in_place_sse41(cv_ptr, block_ptr, block_len, counter, flags)
+}
+
+/// Forwards one block to the external `blake3_compress_xof_sse41` routine
+/// and returns the 64-byte buffer that routine was given to fill.
+///
+/// # Safety
+///
+/// This may only be called on platforms supporting SSE4.1.
+pub unsafe fn compress_xof(
+    cv: &CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    counter: u64,
+    flags: u8,
+) -> [u8; 64] {
+    // Zero-initialized so the buffer is valid to return whatever the callee
+    // writes into it.
+    let mut output = [0u8; 64];
+    let cv_ptr = cv.as_ptr();
+    let block_ptr = block.as_ptr();
+    let output_ptr = output.as_mut_ptr();
+    ffi::blake3_compress_xof_sse41(cv_ptr, block_ptr, block_len, counter, flags, output_ptr);
+    output
+}
+
+/// Forwards a batch of inputs to the external `blake3_hash_many_sse41`
+/// routine.
+///
+/// # Safety
+///
+/// This may only be called on platforms supporting SSE4.1.
+///
+/// # Panics
+///
+/// Panics if `out` holds fewer than `inputs.len() * OUT_LEN` bytes.
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
+    inputs: &[&A],
+    key: &CVWords,
+    counter: u64,
+    increment_counter: IncrementCounter,
+    flags: u8,
+    flags_start: u8,
+    flags_end: u8,
+    out: &mut [u8],
+) {
+    let num_inputs = inputs.len();
+    // Unlike the Rust hash_many implementations, the C side does no bounds
+    // checking on `out`. Check here, even though this function is unsafe.
+    assert!(out.len() >= num_inputs * OUT_LEN);
+    let input_ptrs = inputs.as_ptr() as *const *const u8;
+    let blocks_per_input = A::CAPACITY / BLOCK_LEN;
+    ffi::blake3_hash_many_sse41(
+        input_ptrs,
+        num_inputs,
+        blocks_per_input,
+        key.as_ptr(),
+        counter,
+        increment_counter.yes(),
+        flags,
+        flags_start,
+        flags_end,
+        out.as_mut_ptr(),
+    )
+}
+
+// Raw declarations of the external C-ABI symbols that the wrappers in this
+// file call. NOTE(review): presumably defined by the C/assembly sources that
+// build.rs compiles when the "c" feature is on -- confirm against build.rs.
+pub mod ffi {
+ extern "C" {
+ // Called by `compress_in_place`; `cv` is passed as a mutable pointer.
+ pub fn blake3_compress_in_place_sse41(
+ cv: *mut u32,
+ block: *const u8,
+ block_len: u8,
+ counter: u64,
+ flags: u8,
+ );
+ // Called by `compress_xof`, which passes a 64-byte buffer as `out`.
+ pub fn blake3_compress_xof_sse41(
+ cv: *const u32,
+ block: *const u8,
+ block_len: u8,
+ counter: u64,
+ flags: u8,
+ out: *mut u8,
+ );
+ // Called by `hash_many`, which asserts the length of `out` beforehand.
+ pub fn blake3_hash_many_sse41(
+ inputs: *const *const u8,
+ num_inputs: usize,
+ blocks: usize,
+ key: *const u32,
+ counter: u64,
+ increment_counter: bool,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ }
+}
+
+// Unit tests for the FFI-backed SSE4.1 wrappers.
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_compress() {
+ // The wrappers may only be called when SSE4.1 is available, so the test
+ // silently does nothing on CPUs where it is not detected.
+ if !crate::platform::sse41_detected() {
+ return;
+ }
+ crate::test::test_compress_fn(compress_in_place, compress_xof);
+ }
+
+ #[test]
+ fn test_hash_many() {
+ // Same guard as above: skip when SSE4.1 is not detected.
+ if !crate::platform::sse41_detected() {
+ return;
+ }
+ // The same function is passed for both arguments of the shared helper.
+ crate::test::test_hash_many_fn(hash_many, hash_many);
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7fa3510..58d2dbe 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -39,24 +39,32 @@ mod test;
#[doc(hidden)]
pub mod guts;
-// These modules are pub for benchmarks only. They are not stable.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[doc(hidden)]
-pub mod avx2;
-#[cfg(feature = "c_avx512")]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[doc(hidden)]
-pub mod c_avx512;
-#[cfg(feature = "c_neon")]
-#[doc(hidden)]
-pub mod c_neon;
+// The platform module is pub for benchmarks only. It is not stable.
#[doc(hidden)]
pub mod platform;
-#[doc(hidden)]
-pub mod portable;
+
+// Platform-specific implementations of the compression function.
+mod portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-#[doc(hidden)]
-pub mod sse41;
+cfg_if::cfg_if! {
+ if #[cfg(feature = "c")] {
+ #[path = "c_sse41.rs"]
+ mod sse41;
+ #[path = "c_avx2.rs"]
+ mod avx2;
+ #[path = "c_avx512.rs"]
+ mod avx512;
+ } else {
+ #[path = "rust_sse41.rs"]
+ mod sse41;
+ #[path = "rust_avx2.rs"]
+ mod avx2;
+ // Stable Rust does not currently support AVX-512.
+ }
+}
+#[cfg(feature = "c_neon")]
+#[path = "c_neon.rs"]
+mod neon;
pub mod traits;
diff --git a/src/platform.rs b/src/platform.rs
index b453a6e..163cbbb 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -1,18 +1,10 @@
use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};
-#[cfg(feature = "c_avx512")]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-use crate::c_avx512;
-#[cfg(feature = "c_neon")]
-use crate::c_neon;
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-use crate::{avx2, sse41};
-
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
- if #[cfg(feature = "c_avx512")] {
+ if #[cfg(feature = "c")] {
pub const MAX_SIMD_DEGREE: usize = 16;
} else {
pub const MAX_SIMD_DEGREE: usize = 8;
@@ -32,7 +24,7 @@ cfg_if::cfg_if! {
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
- if #[cfg(feature = "c_avx512")] {
+ if #[cfg(feature = "c")] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
} else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
@@ -52,7 +44,7 @@ pub enum Platform {
SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2,
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX512,
#[cfg(feature = "c_neon")]
@@ -64,7 +56,7 @@ impl Platform {
pub fn detect() -> Self {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
{
if avx512_detected() {
return Platform::AVX512;
@@ -93,7 +85,7 @@ impl Platform {
Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8,
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => 16,
#[cfg(feature = "c_neon")]
@@ -103,7 +95,7 @@ impl Platform {
degree
}
- pub(crate) fn compress_in_place(
+ pub fn compress_in_place(
&self,
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
@@ -116,13 +108,13 @@ impl Platform {
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
- sse41::compress_in_place(cv, block, block_len, counter, flags)
+ crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
- c_avx512::compress_in_place(cv, block, block_len, counter, flags)
+ crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
},
// No NEON compress_in_place() implementation yet.
#[cfg(feature = "c_neon")]
@@ -130,7 +122,7 @@ impl Platform {
}
}
- pub(crate) fn compress_xof(
+ pub fn compress_xof(
&self,
cv: &CVWords,
block: &[u8; BLOCK_LEN],
@@ -143,13 +135,13 @@ impl Platform {
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
- sse41::compress_xof(cv, block, block_len, counter, flags)
+ crate::sse41::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
- c_avx512::compress_xof(cv, block, block_len, counter, flags)
+ crate::avx512::compress_xof(cv, block, block_len, counter, flags)
},
// No NEON compress_xof() implementation yet.
#[cfg(feature = "c_neon")]
@@ -167,7 +159,7 @@ impl Platform {
// after every block, there's a small but measurable performance loss.
// Compressing chunks with a dedicated loop avoids this.
- pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>(
+ pub fn hash_many<A: arrayvec::Array<Item = u8>>(
&self,
inputs: &[&A],
key: &CVWords,
@@ -192,7 +184,7 @@ impl Platform {
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => unsafe {
- sse41::hash_many(
+ crate::sse41::hash_many(
inputs,
key,
counter,
@@ -206,7 +198,7 @@ impl Platform {
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => unsafe {
- avx2::hash_many(
+ crate::avx2::hash_many(
inputs,
key,
counter,
@@ -218,10 +210,10 @@ impl Platform {
)
},
// Safe because detect() checked for platform support.
- #[cfg(feature = "c_avx512")]
+ #[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
- c_avx512::hash_many(
+ crate::avx512::hash_many(
inputs,
key,
counter,
@@ -235,7 +227,7 @@ impl Platform {
// Assumed to be safe if the "c_neon" feature is on.
#[cfg(feature = "c_neon")]
Platform::NEON => unsafe {
- c_neon::hash_many(
+ crate::neon::hash_many(
inputs,
key,
counter,
@@ -248,11 +240,52 @@ impl Platform {
},
}
}
+
+ // Explicit platform constructors, for benchmarks.
+
+    /// Returns the portable platform. No detection is performed.
+    pub fn portable() -> Self {
+        Platform::Portable
+    }
+
+    /// Returns the SSE4.1 platform, or `None` when `sse41_detected()` is
+    /// false.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub fn sse41() -> Option<Self> {
+        if !sse41_detected() {
+            return None;
+        }
+        Some(Platform::SSE41)
+    }
+
+    /// Returns the AVX2 platform, or `None` when `avx2_detected()` is false.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub fn avx2() -> Option<Self> {
+        if !avx2_detected() {
+            return None;
+        }
+        Some(Platform::AVX2)
+    }
+
+    /// Returns the AVX-512 platform, or `None` when `avx512_detected()` is
+    /// false. Only available when the "c" feature is enabled.
+    #[cfg(feature = "c")]
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    pub fn avx512() -> Option<Self> {
+        if !avx512_detected() {
+            return None;
+        }
+        Some(Platform::AVX512)
+    }
+
+    /// Returns the NEON platform. There is no runtime detection here: NEON
+    /// support is assumed whenever the "c_neon" feature is on, so this
+    /// always returns `Some`.
+    // Gated on the feature alone, like the `Platform::NEON` match arms. An
+    // extra x86-only gate would remove this constructor on the ARM targets
+    // it exists for and break the NEON benchmarks that call it.
+    #[cfg(feature = "c_neon")]
+    pub fn neon() -> Option<Self> {
+        Some(Self::NEON)
+    }
}
// Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL.
-#[cfg(feature = "c_avx512")]
+#[cfg(feature = "c")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
pub fn avx512_detected() -> bool {
diff --git a/src/avx2.rs b/src/rust_avx2.rs
index 7f36072..7f36072 100644
--- a/src/avx2.rs
+++ b/src/rust_avx2.rs
diff --git a/src/sse41.rs b/src/rust_sse41.rs
index fcf2f98..fcf2f98 100644
--- a/src/sse41.rs
+++ b/src/rust_sse41.rs
diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml
index 007d1c8..2a90e39 100644
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@@ -3,10 +3,16 @@ name = "test_vectors"
version = "0.0.0"
edition = "2018"
+[features]
+default = []
+c = ["blake3/c"]
+c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"]
+c_neon = ["blake3/c_neon"]
+
[dependencies]
# If you ever change these path dependencies, you'll probably need to update
# cross_test.sh, or CI will break. I'm sorry >.<
-blake3 = { path = "../", features=["c_avx512"] }
+blake3 = { path = "../" }
hex = "0.4.0"
reference_impl = { path = "../reference_impl" }
serde = { version = "1.0", features = ["derive"] }
diff --git a/test_vectors/cross_test.sh b/test_vectors/cross_test.sh
index 1f6a34b..c4d280c 100755
--- a/test_vectors/cross_test.sh
+++ b/test_vectors/cross_test.sh
@@ -19,7 +19,7 @@ mv blake3/test_vectors .
mv blake3/reference_impl test_vectors
mv blake3 test_vectors
cd test_vectors
-sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml
+sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml
sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml
cross test "$@"