From efbfa0463c793dc1319db10ca4e3b809937b227d Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Tue, 11 Feb 2020 14:13:30 -0500 Subject: integrate assembly implementations into the blake3 crate --- .github/workflows/ci.yml | 26 +- Cargo.toml | 19 +- README.md | 20 +- b3sum/Cargo.toml | 4 +- benches/bench.rs | 162 ++++------ build.rs | 107 +++++-- src/avx2.rs | 474 ---------------------------- src/c_avx2.rs | 63 ++++ src/c_avx512.rs | 3 - src/c_neon.rs | 2 - src/c_sse41.rs | 114 +++++++ src/lib.rs | 38 ++- src/platform.rs | 89 ++++-- src/rust_avx2.rs | 474 ++++++++++++++++++++++++++++ src/rust_sse41.rs | 766 +++++++++++++++++++++++++++++++++++++++++++++ src/sse41.rs | 766 --------------------------------------------- test_vectors/Cargo.toml | 8 +- test_vectors/cross_test.sh | 2 +- 18 files changed, 1705 insertions(+), 1432 deletions(-) delete mode 100644 src/avx2.rs create mode 100644 src/c_avx2.rs create mode 100644 src/c_sse41.rs create mode 100644 src/rust_avx2.rs create mode 100644 src/rust_sse41.rs delete mode 100644 src/sse41.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3da4e5..db7decd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,22 +24,30 @@ jobs: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} profile: minimal override: true - # Default tests. - - run: cargo test - # No-default-features tests. + # Default tests plus Rayon. + - run: cargo test --features=rayon + # no_std tests. - run: cargo test --no-default-features - # More features tests. Note that "c_avx512" participates in dynamic feature - # detection, so it'll be built, but it probably won't run. - - run: cargo test --features=c_avx512,rayon + # Test the x86 assembly implementations. Use -vv to log compiler commands. + - run: cargo test --features=c -vv + # Test the C intrinsics implementations. Use -vv to log compiler commands. + - run: cargo test --features=c,c_prefer_intrinsics -vv # Test release mode. This does more iteratations in test_fuzz_hasher. - run: cargo test --release - # Test benchmarks. Nightly only. - - run: cargo test --benches - if: matrix.rust_version == 'nightly' + # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. + - run: cargo test --benches --features=c + env: + RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c + working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c,c_prefer_intrinsics + working-directory: ./test_vectors # Test b3sum. - name: test b3sum run: cargo test diff --git a/Cargo.toml b/Cargo.toml index 4d8e7cf..1a659ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,21 @@ edition = "2018" [features] default = ["std"] -# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU -# feature detection. A binary with "c_avx512" on is still cross-platform. This -# feature has no effect on non-x86. -c_avx512 = [] +# The "c" feature includes C and assembly SIMD implementations of the +# compression function for x86 platforms, called via FFI. (Currently it has no +# effect on other platforms.) This requires a C toolchain on the build machine. +# This is necessary for AVX-512 support, which is not yet stable in Rust, and +# the assembly implementations also perform better than those using Rust/LLVM +# intrinsics. 
As with the Rust implementations, these C and assembly +# implementations participate in runtime CPU feature detection, and the +# resulting binary is portable. +c = [] +# Normally x86-64 builds prefer assembly implementations over C intrinsics. The +# assembly implementations perform better, perform most consistently across +# compilers, and are much faster to build. However, this feature makes the +# build use the C intrinsics implementations instead. This is mainly for +# testing purposes, and most callers will not want to use it. +c_prefer_intrinsics = [] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. diff --git a/README.md b/README.md index 8f881dd..a8ad4c7 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,18 @@ with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which - includes optimized SIMD implementations, with dynamic CPU feature - detection on x86. SSE4.1 and AVX2 support are implemented in Rust, - while AVX-512 and ARM NEON support are imported from the C - implementation and controlled by the `c_avx512` and `c_neon` features. - Multi-threading is implemented with - [Rayon](https://github.com/rayon-rs/rayon) and controlled by the - `rayon` feature. + includes optimized SIMD implementations, with runtime CPU feature + detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c` + feature enables C/assembly implementations and AVX-512 support. The + `c_neon` feature enables ARM NEON support. Multi-threading is also + supported, and the `rayon` feature provides a + [Rayon](https://github.com/rayon-rs/rayon)-based implementation. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. You can install it from [crates.io](https://crates.io/crates/b3sum) with `cargo install - b3sum`. It enables the multi-threading and AVX-512 features of the - `blake3` crate by default. + b3sum`. It enables the `rayon` and `c` features of the `blake3` crate + by default. * The [C implementation](c), which like the Rust implementation includes SIMD code and dynamic CPU feature detection on x86. Unlike the Rust @@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage -This repository provides the `b3sum` command line utility and the -`blake3` Rust crate. 
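A minimal usage sketch, not part of this patch, assuming a downstream crate depends on `blake3` with the new `c` feature enabled (`features = ["c"]`): caller code is identical whichever SIMD backend is compiled in, because the implementation is selected by runtime CPU feature detection.

```rust
// Minimal sketch, not part of this patch: the public API is unchanged by the
// "c" feature; runtime detection picks the fastest compiled-in backend.
fn main() {
    let hash = blake3::hash(b"hello, BLAKE3");
    println!("{}", hash.to_hex());
}
```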
- ### The `b3sum` utility The `b3sum` utility allows you to process files and data from standard diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index c4c8068..aaa23e9 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -9,8 +9,8 @@ readme = "README.md" edition = "2018" [features] -default = ["c_avx512", "rayon"] -c_avx512 = ["blake3/c_avx512"] +default = ["c", "rayon"] +c = ["blake3/c"] c_neon = ["blake3/c_neon"] rayon = ["blake3/rayon", "memmap"] diff --git a/benches/bench.rs b/benches/bench.rs index 0d73970..70be967 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -4,7 +4,7 @@ extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; -use blake3::platform::MAX_SIMD_DEGREE; +use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use rand::prelude::*; use test::Bencher; @@ -48,173 +48,149 @@ impl RandomInput { } } -type CompressInPlaceFn = - unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { +fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); - unsafe { - b.iter(|| f(&mut state, input, 64 as u8, 0, 0)); - } + b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn(b, blake3::portable::compress_in_place); + bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::sse41::compress_in_place); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_single_compression_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::c_avx512::compress_in_place); } -type HashManyFn = unsafe fn( - inputs: &[&A], - key: &[u32; 8], - counter: u64, - increment_counter: blake3::IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) { +fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::Yes, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::Yes, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn 
bench_many_chunks_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_chunks_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } #[bench] #[cfg(feature = "c_neon")] fn bench_many_chunks_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. - bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_chunks_fn(b, platform); + } } // TODO: When we get const generics we can unify this with the chunks code. -fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) { +fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::No, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::No, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_parents_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } #[bench] #[cfg(feature = "c_neon")] fn bench_many_parents_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. 
- bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_parents_fn(b, platform); + } } fn bench_atonce(b: &mut Bencher, len: usize) { diff --git a/build.rs b/build.rs index 67fe3fc..c5a662d 100644 --- a/build.rs +++ b/build.rs @@ -13,6 +13,11 @@ fn is_x86_64() -> bool { target_components()[0] == "x86_64" } +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + fn is_armv7() -> bool { target_components()[0] == "armv7" } @@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool { && target_components()[3] == "msvc" } +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { @@ -37,16 +49,16 @@ fn new_build() -> cc::Build { } const WINDOWS_MSVC_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your version of the MSVC C compiler does -not support the "/arch:AVX512" flag. If you are building the "b3sum" or -"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" -flag. (Note that this also disables other default features like Rayon-based +The "c" feature is enabled, but your version of the MSVC C compiler does not +support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin" +crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. +(Note that this also disables other default features like Rayon-based multithreading, which you can re-enable with "--features=rayon".) Other crates might or might not support this workaround. "#; const GNU_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your C compiler does not support the +The "c" feature is enabled, but your C compiler does not support the "-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also disables other default features like Rayon-based multithreading, which you can @@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) { } fn main() -> Result<(), Box> { - // "c_avx512' is a no-op for non-x86_64 targets. It also participates in - // dynamic CPU feature detection, so it's generally safe to enable. - // However, it probably won't build in some older environments without - // AVX-512 support in the C compiler, and it's disabled by default for that - // reason. - if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() { - let mut build = new_build(); - check_for_avx512_compiler_support(&build); - build.file("c/blake3_avx512.c"); - if is_windows_msvc() { - // Note that a lot of versions of MSVC don't support /arch:AVX512, - // and they'll discard it with a warning, hopefully leading to a - // build error. - build.flag("/arch:AVX512"); + if defined("CARGO_FEATURE_C") { + if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "c_prefer_intrinsics" feature is enabled. 
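+            // In short: MSVC targets get the *-windows-msvc.asm files,
+            // windows-gnu targets get the *-windows-gnu.S files, and all
+            // other 64-bit targets get the *-unix.S files (which include a
+            // small macOS workaround).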
+ if is_windows_msvc() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx2-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx512-x86_64-windows-msvc.asm"); + build.compile("blake3_asm"); + } else if is_windows_gnu() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-gnu.S"); + build.file("c/blake3-avx2-x86_64-windows-gnu.S"); + build.file("c/blake3-avx512-x86_64-windows-gnu.S"); + build.compile("blake3_asm"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-unix.S"); + build.file("c/blake3-avx2-x86_64-unix.S"); + build.file("c/blake3-avx512-x86_64-unix.S"); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "c_prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. + + let mut sse41_build = new_build(); + sse41_build.file("c/blake3_sse41.c"); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrisincs: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file("c/blake3_avx2.c"); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + check_for_avx512_compiler_support(&avx512_build); + avx512_build.file("c/blake3_avx512.c"); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. + avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); } else { - build.flag("-mavx512f"); - build.flag("-mavx512vl"); + // Currently no effect for non-x86 platforms. } - build.compile("blake3_avx512"); } if defined("CARGO_FEATURE_C_NEON") { diff --git a/src/avx2.rs b/src/avx2.rs deleted file mode 100644 index 7f36072..0000000 --- a/src/avx2.rs +++ /dev/null @@ -1,474 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, -}; -use arrayref::{array_mut_ref, mut_array_refs}; - -pub const DEGREE: usize = 8; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m256i { - // This is an unaligned load, so the pointer cast is allowed. - _mm256_loadu_si256(src as *const __m256i) -} - -#[inline(always)] -unsafe fn storeu(src: __m256i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. 
- _mm256_storeu_si256(dest as *mut __m256i, src) -} - -#[inline(always)] -unsafe fn add(a: __m256i, b: __m256i) -> __m256i { - _mm256_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { - _mm256_xor_si256(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m256i { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { - _mm256_setr_epi32( - a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, - ) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], 
v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { - ( - _mm256_permute2x128_si256(a, b, 0x20), - _mm256_permute2x128_si256(a, b, 0x31), - ) -} - -// There are several ways to do a transposition. We could do it naively, with 8 separate -// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy -// the vecs into contiguous storage and then use gather instructions. This third approach is to use -// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the -// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the -// https://github.com/oconnor663/bao_experiments repo. -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. - let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. - let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); - let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); - let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); - let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); - - vecs[0] = abcdefgh_0; - vecs[1] = abcdefgh_1; - vecs[2] = abcdefgh_2; - vecs[3] = abcdefgh_3; - vecs[4] = abcdefgh_4; - vecs[5] = abcdefgh_5; - vecs[6] = abcdefgh_6; - vecs[7] = abcdefgh_7; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set8( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), - counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), - counter_low(counter + (mask & 7)), - ), - set8( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), - counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), - counter_high(counter + (mask & 7)), - ), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash8( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. 
Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&mut h_vecs); - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash_many>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash8( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::avx2_detected() { - return; - } - - #[target_feature(enable = "avx2")] - unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/src/c_avx2.rs b/src/c_avx2.rs new file mode 100644 index 0000000..d805e86 --- /dev/null +++ b/src/c_avx2.rs @@ -0,0 +1,63 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. +pub unsafe fn hash_many>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/c_avx512.rs b/src/c_avx512.rs index f20de2c..c1b9f64 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 16; - // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_in_place( cv: &mut CVWords, @@ -91,7 +89,6 @@ pub mod ffi { flags_end: u8, out: *mut u8, ); - } } diff --git a/src/c_neon.rs b/src/c_neon.rs index 34ef074..77b9654 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 4; - // Unsafe because this may only be called on platforms supporting NEON. pub unsafe fn hash_many>( inputs: &[&A], diff --git a/src/c_sse41.rs b/src/c_sse41.rs new file mode 100644 index 0000000..0b64c90 --- /dev/null +++ b/src/c_sse41.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn hash_many>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7fa3510..58d2dbe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -39,24 +39,32 @@ mod test; #[doc(hidden)] pub mod guts; -// These modules are pub for benchmarks only. They are not stable. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod avx2; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod c_avx512; -#[cfg(feature = "c_neon")] -#[doc(hidden)] -pub mod c_neon; +// The platform module is pub for benchmarks only. It is not stable. #[doc(hidden)] pub mod platform; -#[doc(hidden)] -pub mod portable; + +// Platform-specific implementations of the compression function. +mod portable; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod sse41; +cfg_if::cfg_if! { + if #[cfg(feature = "c")] { + #[path = "c_sse41.rs"] + mod sse41; + #[path = "c_avx2.rs"] + mod avx2; + #[path = "c_avx512.rs"] + mod avx512; + } else { + #[path = "rust_sse41.rs"] + mod sse41; + #[path = "rust_avx2.rs"] + mod avx2; + // Stable Rust does not currently support AVX-512. + } +} +#[cfg(feature = "c_neon")] +#[path = "c_neon.rs"] +mod neon; pub mod traits; diff --git a/src/platform.rs b/src/platform.rs index b453a6e..163cbbb 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,18 +1,10 @@ use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::c_avx512; -#[cfg(feature = "c_neon")] -use crate::c_neon; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::{avx2, sse41}; - cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; @@ -32,7 +24,7 @@ cfg_if::cfg_if! { cfg_if::cfg_if! 
{ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; @@ -52,7 +44,7 @@ pub enum Platform { SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(feature = "c_neon")] @@ -64,7 +56,7 @@ impl Platform { pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] { if avx512_detected() { return Platform::AVX512; @@ -93,7 +85,7 @@ impl Platform { Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(feature = "c_neon")] @@ -103,7 +95,7 @@ impl Platform { degree } - pub(crate) fn compress_in_place( + pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], @@ -116,13 +108,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_in_place(cv, block, block_len, counter, flags) + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_in_place(cv, block, block_len, counter, flags) + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. #[cfg(feature = "c_neon")] @@ -130,7 +122,7 @@ impl Platform { } } - pub(crate) fn compress_xof( + pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], @@ -143,13 +135,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_xof(cv, block, block_len, counter, flags) + crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_xof(cv, block, block_len, counter, flags) + crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(feature = "c_neon")] @@ -167,7 +159,7 @@ impl Platform { // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. - pub(crate) fn hash_many>( + pub fn hash_many>( &self, inputs: &[&A], key: &CVWords, @@ -192,7 +184,7 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { - sse41::hash_many( + crate::sse41::hash_many( inputs, key, counter, @@ -206,7 +198,7 @@ impl Platform { // Safe because detect() checked for platform support. 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { - avx2::hash_many( + crate::avx2::hash_many( inputs, key, counter, @@ -218,10 +210,10 @@ impl Platform { ) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::hash_many( + crate::avx512::hash_many( inputs, key, counter, @@ -235,7 +227,7 @@ impl Platform { // Assumed to be safe if the "c_neon" feature is on. #[cfg(feature = "c_neon")] Platform::NEON => unsafe { - c_neon::hash_many( + crate::neon::hash_many( inputs, key, counter, @@ -248,11 +240,52 @@ impl Platform { }, } } + + // Explicit platform constructors, for benchmarks. + + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(feature = "c")] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(feature = "c_neon")] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn neon() -> Option { + // Assumed to be safe if the "c_neon" feature is on. + Some(Self::NEON) + } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx512_detected() -> bool { diff --git a/src/rust_avx2.rs b/src/rust_avx2.rs new file mode 100644 index 0000000..7f36072 --- /dev/null +++ b/src/rust_avx2.rs @@ -0,0 +1,474 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; +use arrayref::{array_mut_ref, mut_array_refs}; + +pub const DEGREE: usize = 8; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m256i { + _mm256_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { + _mm256_setr_epi32( + a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, + ) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. 
Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = 
add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. This third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. + let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. + let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
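+    // (In interleave128 above, _mm256_permute2x128_si256 with selector 0x20
+    // combines the low 128-bit halves of its two inputs, and selector 0x31
+    // combines the high halves.)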
+ let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); + let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); + let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); + let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); + + vecs[0] = abcdefgh_0; + vecs[1] = abcdefgh_1; + vecs[2] = abcdefgh_2; + vecs[3] = abcdefgh_3; + vecs[4] = abcdefgh_4; + vecs[5] = abcdefgh_5; + vecs[6] = abcdefgh_6; + vecs[7] = abcdefgh_7; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set8( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), + ), + set8( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), + ), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash8( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. 
Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. + let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&mut h_vecs); + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash_many>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. + let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash8( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::avx2_detected() { + return; + } + + #[target_feature(enable = "avx2")] + unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
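+                // (Transposing swaps rows and columns, so the element written
+                // at (i, j) above must now be read back at (j, i).)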
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/rust_sse41.rs b/src/rust_sse41.rs new file mode 100644 index 0000000..fcf2f98 --- /dev/null +++ b/src/rust_sse41.rs @@ -0,0 +1,766 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. 
All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
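+    // For reference, that permutation (from the BLAKE3 spec) is
+    // 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, which is why the
+    // same shuffle/blend sequence repeats unchanged in rounds 2 through 7.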
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] 
= xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
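+    // After this second interleave, vector N holds lane N of each original
+    // vector, i.e. column N of the 4x4 matrix of 32-bit words.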
+ let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
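+        // State layout: v[0..8] are the transposed chaining values, v[8..12]
+        // are the IV constants, and v[12..16] hold the counters, the block
+        // length, and the flags, with each 32-bit lane corresponding to a
+        // different input.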
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1>( + input: &A, + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = input.as_slice(); + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
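+        // (A `&A` reference and a thin `*const u8` pointer have the same
+        // single-pointer layout, and the loop condition above guarantees at
+        // least DEGREE inputs remain, so reading DEGREE pointers here stays
+        // in bounds.)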
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. + assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/sse41.rs b/src/sse41.rs deleted file mode 100644 index fcf2f98..0000000 --- a/src/sse41.rs +++ /dev/null @@ -1,766 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. 
- -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - 
v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse4.1")] -unsafe fn hash1>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash_many>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse41_detected() { - return; - } - - #[target_feature(enable = "sse4.1")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml index 007d1c8..2a90e39 100644 --- a/test_vectors/Cargo.toml +++ b/test_vectors/Cargo.toml @@ -3,10 +3,16 @@ name = "test_vectors" version = "0.0.0" edition = "2018" +[features] +default = [] +c = ["blake3/c"] +c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"] +c_neon = ["blake3/c_neon"] + [dependencies] # If you ever change these path dependencies, you'll probably need to update # cross_test.sh, or CI will break. I'm sorry >.< -blake3 = { path = "../", features=["c_avx512"] } +blake3 = { path = "../" } hex = "0.4.0" reference_impl = { path = "../reference_impl" } serde = { version = "1.0", features = ["derive"] } diff --git a/test_vectors/cross_test.sh b/test_vectors/cross_test.sh index 1f6a34b..c4d280c 100755 --- a/test_vectors/cross_test.sh +++ b/test_vectors/cross_test.sh @@ -19,7 +19,7 @@ mv blake3/test_vectors . mv blake3/reference_impl test_vectors mv blake3 test_vectors cd test_vectors -sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml +sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml cross test "$@" -- cgit v1.2.3