about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Cargo.toml 6
-rw-r--r-- build.rs 65
-rw-r--r-- src/c_avx512.rs 109
-rw-r--r-- src/c_neon.rs 59
-rw-r--r-- src/lib.rs 6
-rw-r--r-- src/platform.rs 108
6 files changed, 343 insertions, 10 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 61d4f91..756703c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,8 +5,9 @@ authors = ["Jack O'Connor <[email protected]>"]
edition = "2018"
[features]
-default = ["c", "std"]
-c = []
+default = ["std"]
+c_avx512 = []
+c_neon = []
std = []
[dependencies]
@@ -14,6 +15,7 @@ arrayref = "0.3.5"
arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] }
constant_time_eq = "0.1.4"
rayon = { version = "1.2.1", optional = true }
+cfg-if = "0.1.10"
[dev-dependencies]
page_size = "0.4.1"
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..f710508
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,65 @@
+use std::env;
+
+fn defined(var: &str) -> bool {
+ env::var_os(var).is_some()
+}
+
+// Reports whether the third `-`-separated component of $TARGET is "windows".
+// Uses nth() rather than indexing so that a two-component triple such as
+// "wasm32-wasi" yields false instead of panicking in the build script.
+fn is_windows() -> bool {
+ env::var("TARGET").unwrap().split('-').nth(2) == Some("windows")
+}
+
+fn new_build() -> cc::Build {
+ let mut build = cc::Build::new();
+ if !is_windows() {
+ build.flag("-std=c11");
+ }
+ build
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+ if defined("CARGO_FEATURE_C_AVX512") {
+ let mut build = new_build();
+ build.file("src/c/blake3_avx512.c");
+ if is_windows() {
+ // Note that a lot of versions of MSVC don't support /arch:AVX512,
+ // and they'll discard it with a warning, hopefully leading to a
+ // build error.
+ build.flag("/arch:AVX512");
+ } else {
+ build.flag("-mavx512f");
+ build.flag("-mavx512vl");
+ }
+ build.compile("blake3_avx512");
+ }
+
+ if defined("CARGO_FEATURE_C_NEON") {
+ let mut build = new_build();
+ build.file("src/c/blake3_neon.c");
+ build.file("src/c/blake3_portable.c");
+ // ARMv7 platforms that support NEON generally need the following
+ // flags. AArch64 supports NEON by default and does not support -mfpu.
+ // build.flag("-mfpu=neon-vfpv4");
+ // build.flag("-mfloat-abi=hard");
+ build.compile("blake3_neon");
+ }
+
+ // The `cc` crate does not automatically emit rerun-if directives for the
+ // environment variables it supports, in particular for $CC. We expect to
+ // do a lot of benchmarking across different compilers, so we explicitly
+ // add the variables that we're likely to need.
+ println!("cargo:rerun-if-env-changed=CC");
+ println!("cargo:rerun-if-env-changed=CFLAGS");
+
+ // Ditto for source files, though these shouldn't change as often.
+ for file in std::fs::read_dir("src/c")? {
+ println!(
+ "cargo:rerun-if-changed={}",
+ file?.path().to_str().expect("utf-8")
+ );
+ }
+
+ Ok(())
+}
diff --git a/src/c_avx512.rs b/src/c_avx512.rs
new file mode 100644
index 0000000..f27c33a
--- /dev/null
+++ b/src/c_avx512.rs
@@ -0,0 +1,109 @@
+use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN};
+
+// Unsafe because this may only be called on platforms supporting AVX-512.
+pub unsafe fn compress(
+ cv: &[u8; 32],
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) -> [u8; 64] {
+ let mut out = [0u8; 64];
+ ffi::blake3_compress_avx512(
+ cv.as_ptr(),
+ block.as_ptr(),
+ block_len,
+ offset,
+ flags,
+ out.as_mut_ptr(),
+ );
+ out
+}
+
+// Unsafe because this may only be called on platforms supporting AVX-512.
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
+ inputs: &[&A],
+ key: &[u8; KEY_LEN],
+ offset: u64,
+ offset_deltas: &OffsetDeltas,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: &mut [u8],
+) {
+ // The Rust hash_many implementations do bounds checking on the `out`
+ // array, but the C implementations don't. Even though this is an unsafe
+ // function, assert the bounds here.
+ assert!(out.len() >= inputs.len() * OUT_LEN);
+ ffi::blake3_hash_many_avx512(
+ inputs.as_ptr() as *const *const u8,
+ inputs.len(),
+ A::CAPACITY / BLOCK_LEN,
+ key.as_ptr(),
+ offset,
+ offset_deltas.as_ptr(),
+ flags,
+ flags_start,
+ flags_end,
+ out.as_mut_ptr(),
+ )
+}
+
+pub mod ffi {
+ extern "C" {
+ pub fn blake3_compress_avx512(
+ cv: *const u8,
+ block: *const u8,
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+ out: *mut u8,
+ );
+ // hash4/hash8/hash16 are exposed here for benchmarks.
+ pub fn blake3_hash4_avx512(
+ inputs: *const *const u8,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ pub fn blake3_hash8_avx512(
+ inputs: *const *const u8,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ pub fn blake3_hash16_avx512(
+ inputs: *const *const u8,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ pub fn blake3_hash_many_avx512(
+ inputs: *const *const u8,
+ num_inputs: usize,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ }
+}
diff --git a/src/c_neon.rs b/src/c_neon.rs
new file mode 100644
index 0000000..de55aa7
--- /dev/null
+++ b/src/c_neon.rs
@@ -0,0 +1,59 @@
+use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN};
+
+// Unsafe because this may only be called on platforms supporting NEON.
+pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
+ inputs: &[&A],
+ key: &[u8; KEY_LEN],
+ offset: u64,
+ offset_deltas: &OffsetDeltas,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: &mut [u8],
+) {
+ // The Rust hash_many implementations do bounds checking on the `out`
+ // array, but the C implementations don't. Even though this is an unsafe
+ // function, assert the bounds here.
+ assert!(out.len() >= inputs.len() * OUT_LEN);
+ ffi::blake3_hash_many_neon(
+ inputs.as_ptr() as *const *const u8,
+ inputs.len(),
+ A::CAPACITY / BLOCK_LEN,
+ key.as_ptr(),
+ offset,
+ offset_deltas.as_ptr(),
+ flags,
+ flags_start,
+ flags_end,
+ out.as_mut_ptr(),
+ )
+}
+
+pub mod ffi {
+ extern "C" {
+ // Exposed here for benchmarks.
+ pub fn blake3_hash4_neon(
+ inputs: *const *const u8,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ pub fn blake3_hash_many_neon(
+ inputs: *const *const u8,
+ num_inputs: usize,
+ blocks: usize,
+ key: *const u8,
+ offset: u64,
+ offset_deltas: *const u64,
+ flags: u8,
+ flags_start: u8,
+ flags_end: u8,
+ out: *mut u8,
+ );
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 592d052..6ee4384 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,12 @@ mod test;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[doc(hidden)]
pub mod avx2;
+#[cfg(feature = "c_avx512")]
+#[doc(hidden)]
+pub mod c_avx512;
+#[cfg(feature = "c_neon")]
+#[doc(hidden)]
+pub mod c_neon;
#[doc(hidden)]
pub mod platform;
#[doc(hidden)]
diff --git a/src/platform.rs b/src/platform.rs
index ed6bad7..99db59a 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -1,21 +1,39 @@
use crate::{portable, OffsetDeltas, BLOCK_LEN, KEY_LEN};
+#[cfg(feature = "c_avx512")]
+use crate::c_avx512;
+#[cfg(feature = "c_neon")]
+use crate::c_neon;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::{avx2, sse41};
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub const MAX_SIMD_DEGREE: usize = 8;
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
-pub const MAX_SIMD_DEGREE: usize = 1;
+cfg_if::cfg_if! {
+ if #[cfg(feature = "c_avx512")] {
+ pub const MAX_SIMD_DEGREE: usize = 16;
+ } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ pub const MAX_SIMD_DEGREE: usize = 8;
+ } else if #[cfg(feature = "c_neon")] {
+ pub const MAX_SIMD_DEGREE: usize = 4;
+ } else {
+ pub const MAX_SIMD_DEGREE: usize = 1;
+ }
+}
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
// allowed to use cmp::max, so we have to hardcode this additional constant
// value. Get rid of this once cmp::max is a const fn.
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
-pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
+cfg_if::cfg_if! {
+ if #[cfg(feature = "c_avx512")] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
+ } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
+ } else if #[cfg(feature = "c_neon")] {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
+ } else {
+ pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
+ }
+}
#[derive(Clone, Copy, Debug)]
pub enum Platform {
@@ -24,12 +42,22 @@ pub enum Platform {
SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2,
+ #[cfg(feature = "c_avx512")]
+ AVX512,
+ #[cfg(feature = "c_neon")]
+ NEON,
}
impl Platform {
pub fn detect() -> Self {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
+ #[cfg(feature = "c_avx512")]
+ {
+ if avx512_detected() {
+ return Platform::AVX512;
+ }
+ }
if avx2_detected() {
return Platform::AVX2;
}
@@ -37,6 +65,12 @@ impl Platform {
return Platform::SSE41;
}
}
+ // We don't use dynamic feature detection for NEON. If the "c_neon"
+ // feature is on, NEON is assumed to be supported.
+ #[cfg(feature = "c_neon")]
+ {
+ return Platform::NEON;
+ }
Platform::Portable
}
@@ -47,6 +81,10 @@ impl Platform {
Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8,
+ #[cfg(feature = "c_avx512")]
+ Platform::AVX512 => 16,
+ #[cfg(feature = "c_neon")]
+ Platform::NEON => 4,
};
debug_assert!(degree <= MAX_SIMD_DEGREE);
degree
@@ -67,6 +105,12 @@ impl Platform {
Platform::SSE41 | Platform::AVX2 => unsafe {
sse41::compress(cv, block, block_len, offset, flags)
},
+ // Safe because detect() checked for platform support.
+ #[cfg(feature = "c_avx512")]
+ Platform::AVX512 => unsafe { c_avx512::compress(cv, block, block_len, offset, flags) },
+ // No NEON compress() implementation yet.
+ #[cfg(feature = "c_neon")]
+ Platform::NEON => portable::compress(cv, block, block_len, offset, flags),
}
}
@@ -130,10 +174,58 @@ impl Platform {
out,
)
},
+ // Safe because detect() checked for platform support.
+ #[cfg(feature = "c_avx512")]
+ Platform::AVX512 => unsafe {
+ c_avx512::hash_many(
+ inputs,
+ key,
+ offset,
+ offset_deltas,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
+ // Assumed to be safe if the "c_neon" feature is on.
+ #[cfg(feature = "c_neon")]
+ Platform::NEON => unsafe {
+ c_neon::hash_many(
+ inputs,
+ key,
+ offset,
+ offset_deltas,
+ flags,
+ flags_start,
+ flags_end,
+ out,
+ )
+ },
}
}
}
+// Note that AVX-512 is divided into multiple featuresets, and we use two of
+// them, F and VL.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[inline(always)]
+pub fn avx512_detected() -> bool {
+ // Static check, e.g. for building with target-cpu=native.
+ #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
+ {
+ return true;
+ }
+ // Dynamic check, if std is enabled.
+ #[cfg(feature = "std")]
+ {
+ if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
+ return true;
+ }
+ }
+ false
+}
+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
pub fn avx2_detected() -> bool {