diff options
| -rw-r--r-- | Cargo.toml | 6 | ||||
| -rw-r--r-- | build.rs | 65 | ||||
| -rw-r--r-- | src/c_avx512.rs | 109 | ||||
| -rw-r--r-- | src/c_neon.rs | 59 | ||||
| -rw-r--r-- | src/lib.rs | 6 | ||||
| -rw-r--r-- | src/platform.rs | 108 |
6 files changed, 343 insertions, 10 deletions
@@ -5,8 +5,9 @@ authors = ["Jack O'Connor <[email protected]>"] edition = "2018" [features] -default = ["c", "std"] -c = [] +default = ["std"] +c_avx512 = [] +c_neon = [] std = [] [dependencies] @@ -14,6 +15,7 @@ arrayref = "0.3.5" arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] } constant_time_eq = "0.1.4" rayon = { version = "1.2.1", optional = true } +cfg-if = "0.1.10" [dev-dependencies] page_size = "0.4.1" diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..f710508 --- /dev/null +++ b/build.rs @@ -0,0 +1,65 @@ +use std::env; + +fn defined(var: &str) -> bool { + env::var_os(var).is_some() +} + +fn is_windows() -> bool { + let target = env::var("TARGET").unwrap(); + let target_components: Vec<&str> = target.split("-").collect(); + let target_os = target_components[2]; + target_os == "windows" +} + +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows() { + build.flag("-std=c11"); + } + build +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + if defined("CARGO_FEATURE_C_AVX512") { + let mut build = new_build(); + build.file("src/c/blake3_avx512.c"); + if is_windows() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. + build.flag("/arch:AVX512"); + } else { + build.flag("-mavx512f"); + build.flag("-mavx512vl"); + } + build.compile("blake3_avx512"); + } + + if defined("CARGO_FEATURE_C_NEON") { + let mut build = new_build(); + build.file("src/c/blake3_neon.c"); + build.file("src/c/blake3_portable.c"); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mfpu. 
+ // build.flag("-mfpu=neon-vfpv4"); + // build.flag("-mfloat-abi=hard"); + build.compile("blake3_neon"); + } + + // The `cc` crate does not automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. + for file in std::fs::read_dir("src/c")? { + println!( + "cargo:rerun-if-changed={}", + file?.path().to_str().expect("utf-8") + ); + } + + Ok(()) +} diff --git a/src/c_avx512.rs b/src/c_avx512.rs new file mode 100644 index 0000000..f27c33a --- /dev/null +++ b/src/c_avx512.rs @@ -0,0 +1,109 @@ +use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress( + cv: &[u8; 32], + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + offset, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &[u8; KEY_LEN], + offset: u64, + offset_deltas: &OffsetDeltas, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. 
+ assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx512( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + offset, + offset_deltas.as_ptr(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_avx512( + cv: *const u8, + block: *const u8, + block_len: u8, + offset: u64, + flags: u8, + out: *mut u8, + ); + // hash4/hash8/hash16 are exposed here for benchmarks. + pub fn blake3_hash4_avx512( + inputs: *const *const u8, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + pub fn blake3_hash8_avx512( + inputs: *const *const u8, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + pub fn blake3_hash16_avx512( + inputs: *const *const u8, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} diff --git a/src/c_neon.rs b/src/c_neon.rs new file mode 100644 index 0000000..de55aa7 --- /dev/null +++ b/src/c_neon.rs @@ -0,0 +1,59 @@ +use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting NEON. +pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( + inputs: &[&A], + key: &[u8; KEY_LEN], + offset: u64, + offset_deltas: &OffsetDeltas, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. 
Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_neon( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + offset, + offset_deltas.as_ptr(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + // Exposed here for benchmarks. + pub fn blake3_hash4_neon( + inputs: *const *const u8, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u8, + offset: u64, + offset_deltas: *const u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} @@ -7,6 +7,12 @@ mod test; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[doc(hidden)] pub mod avx2; +#[cfg(feature = "c_avx512")] +#[doc(hidden)] +pub mod c_avx512; +#[cfg(feature = "c_neon")] +#[doc(hidden)] +pub mod c_neon; #[doc(hidden)] pub mod platform; #[doc(hidden)] diff --git a/src/platform.rs b/src/platform.rs index ed6bad7..99db59a 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,21 +1,39 @@ use crate::{portable, OffsetDeltas, BLOCK_LEN, KEY_LEN}; +#[cfg(feature = "c_avx512")] +use crate::c_avx512; +#[cfg(feature = "c_neon")] +use crate::c_neon; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use crate::{avx2, sse41}; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub const MAX_SIMD_DEGREE: usize = 8; -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] -pub const MAX_SIMD_DEGREE: usize = 1; +cfg_if::cfg_if! 
{ + if #[cfg(feature = "c_avx512")] { + pub const MAX_SIMD_DEGREE: usize = 16; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + pub const MAX_SIMD_DEGREE: usize = 8; + } else if #[cfg(feature = "c_neon")] { + pub const MAX_SIMD_DEGREE: usize = 4; + } else { + pub const MAX_SIMD_DEGREE: usize = 1; + } +} // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently // allowed to use cmp::max, so we have to hardcode this additional constant // value. Get rid of this once cmp::max is a const fn. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub const MAX_SIMD_DEGREE_OR_2: usize = 8; -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] -pub const MAX_SIMD_DEGREE_OR_2: usize = 2; +cfg_if::cfg_if! { + if #[cfg(feature = "c_avx512")] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 16; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 8; + } else if #[cfg(feature = "c_neon")] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + } +} #[derive(Clone, Copy, Debug)] pub enum Platform { @@ -24,12 +42,22 @@ pub enum Platform { SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, + #[cfg(feature = "c_avx512")] + AVX512, + #[cfg(feature = "c_neon")] + NEON, } impl Platform { pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[cfg(feature = "c_avx512")] + { + if avx512_detected() { + return Platform::AVX512; + } + } if avx2_detected() { return Platform::AVX2; } @@ -37,6 +65,12 @@ impl Platform { return Platform::SSE41; } } + // We don't use dynamic feature detection for NEON. If the "c_neon" + // feature is on, NEON is assumed to be supported. 
+ #[cfg(feature = "c_neon")] + { + return Platform::NEON; + } Platform::Portable } @@ -47,6 +81,10 @@ impl Platform { Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, + #[cfg(feature = "c_avx512")] + Platform::AVX512 => 16, + #[cfg(feature = "c_neon")] + Platform::NEON => 4, }; debug_assert!(degree <= MAX_SIMD_DEGREE); degree @@ -67,6 +105,12 @@ impl Platform { Platform::SSE41 | Platform::AVX2 => unsafe { sse41::compress(cv, block, block_len, offset, flags) }, + // Safe because detect() checked for platform support. + #[cfg(feature = "c_avx512")] + Platform::AVX512 => unsafe { c_avx512::compress(cv, block, block_len, offset, flags) }, + // No NEON compress() implementation yet. + #[cfg(feature = "c_neon")] + Platform::NEON => portable::compress(cv, block, block_len, offset, flags), } } @@ -130,10 +174,58 @@ impl Platform { out, ) }, + // Safe because detect() checked for platform support. + #[cfg(feature = "c_avx512")] + Platform::AVX512 => unsafe { + c_avx512::hash_many( + inputs, + key, + offset, + offset_deltas, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "c_neon" feature is on. + #[cfg(feature = "c_neon")] + Platform::NEON => unsafe { + c_neon::hash_many( + inputs, + key, + offset, + offset_deltas, + flags, + flags_start, + flags_end, + out, + ) + }, } } } +// Note that AVX-512 is divided into multiple featuresets, and we use two of +// them, F and VL. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx512_detected() -> bool { + // Static check, e.g. for building with target-cpu=native. + #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] + { + return true; + } + // Dynamic check, if std is enabled. 
+ #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return true; + } + } + false +} + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx2_detected() -> bool { |
