From efbfa0463c793dc1319db10ca4e3b809937b227d Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Tue, 11 Feb 2020 14:13:30 -0500 Subject: integrate assembly implementations into the blake3 crate --- .github/workflows/ci.yml | 26 +- Cargo.toml | 19 +- README.md | 20 +- b3sum/Cargo.toml | 4 +- benches/bench.rs | 162 ++++------ build.rs | 107 +++++-- src/avx2.rs | 474 ---------------------------- src/c_avx2.rs | 63 ++++ src/c_avx512.rs | 3 - src/c_neon.rs | 2 - src/c_sse41.rs | 114 +++++++ src/lib.rs | 38 ++- src/platform.rs | 89 ++++-- src/rust_avx2.rs | 474 ++++++++++++++++++++++++++++ src/rust_sse41.rs | 766 +++++++++++++++++++++++++++++++++++++++++++++ src/sse41.rs | 766 --------------------------------------------- test_vectors/Cargo.toml | 8 +- test_vectors/cross_test.sh | 2 +- 18 files changed, 1705 insertions(+), 1432 deletions(-) delete mode 100644 src/avx2.rs create mode 100644 src/c_avx2.rs create mode 100644 src/c_sse41.rs create mode 100644 src/rust_avx2.rs create mode 100644 src/rust_sse41.rs delete mode 100644 src/sse41.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3da4e5..db7decd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,22 +24,30 @@ jobs: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} profile: minimal override: true - # Default tests. - - run: cargo test - # No-default-features tests. + # Default tests plus Rayon. + - run: cargo test --features=rayon + # no_std tests. - run: cargo test --no-default-features - # More features tests. Note that "c_avx512" participates in dynamic feature - # detection, so it'll be built, but it probably won't run. - - run: cargo test --features=c_avx512,rayon + # Test the x86 assembly implementations. Use -vv to log compiler commands. + - run: cargo test --features=c -vv + # Test the C intrinsics implementations. Use -vv to log compiler commands. + - run: cargo test --features=c,c_prefer_intrinsics -vv # Test release mode. This does more iteratations in test_fuzz_hasher. - run: cargo test --release - # Test benchmarks. Nightly only. - - run: cargo test --benches - if: matrix.rust_version == 'nightly' + # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. + - run: cargo test --benches --features=c + env: + RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c + working-directory: ./test_vectors + - name: test vectors + run: cargo test --features=c,c_prefer_intrinsics + working-directory: ./test_vectors # Test b3sum. - name: test b3sum run: cargo test diff --git a/Cargo.toml b/Cargo.toml index 4d8e7cf..1a659ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,21 @@ edition = "2018" [features] default = ["std"] -# Like SSE4.1 and AVX2, the AVX-512 implementation participates in dynamic CPU -# feature detection. A binary with "c_avx512" on is still cross-platform. This -# feature has no effect on non-x86. -c_avx512 = [] +# The "c" feature includes C and assembly SIMD implementations of the +# compression function for x86 platforms, called via FFI. (Currently it has no +# effect on other platforms.) This requires a C toolchain on the build machine. +# This is necessary for AVX-512 support, which is not yet stable in Rust, and +# the assembly implementations also perform better than those using Rust/LLVM +# intrinsics. 
As with the Rust implementations, these C and assembly +# implementations participate in runtime CPU feature detection, and the +# resulting binary is portable. +c = [] +# Normally x86-64 builds prefer assembly implementations over C intrinsics. The +# assembly implementations perform better, perform most consistently across +# compilers, and are much faster to build. However, this feature makes the +# build use the C intrinsics implementations instead. This is mainly for +# testing purposes, and most callers will not want to use it. +c_prefer_intrinsics = [] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "c_neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. diff --git a/README.md b/README.md index 8f881dd..a8ad4c7 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,18 @@ with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which - includes optimized SIMD implementations, with dynamic CPU feature - detection on x86. SSE4.1 and AVX2 support are implemented in Rust, - while AVX-512 and ARM NEON support are imported from the C - implementation and controlled by the `c_avx512` and `c_neon` features. - Multi-threading is implemented with - [Rayon](https://github.com/rayon-rs/rayon) and controlled by the - `rayon` feature. + includes optimized SIMD implementations, with runtime CPU feature + detection on x86. SSE4.1 and AVX2 are supported in pure Rust. The `c` + feature enables C/assembly implementations and AVX-512 support. The + `c_neon` feature enables ARM NEON support. Multi-threading is also + supported, and the `rayon` feature provides a + [Rayon](https://github.com/rayon-rs/rayon)-based implementation. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. You can install it from [crates.io](https://crates.io/crates/b3sum) with `cargo install - b3sum`. It enables the multi-threading and AVX-512 features of the - `blake3` crate by default. + b3sum`. It enables the `rayon` and `c` features of the `blake3` crate + by default. * The [C implementation](c), which like the Rust implementation includes SIMD code and dynamic CPU feature detection on x86. Unlike the Rust @@ -80,9 +79,6 @@ we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage -This repository provides the `b3sum` command line utility and the -`blake3` Rust crate. 
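A minimal usage sketch, not part of this patch, assuming a downstream crate depends on `blake3` with the new `c` feature enabled (`features = ["c"]`): caller code is identical whichever SIMD backend is compiled in, because the implementation is selected by runtime CPU feature detection.

```rust
// Minimal sketch, not part of this patch: the public API is unchanged by the
// "c" feature; runtime detection picks the fastest compiled-in backend.
fn main() {
    let hash = blake3::hash(b"hello, BLAKE3");
    println!("{}", hash.to_hex());
}
```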
- ### The `b3sum` utility The `b3sum` utility allows you to process files and data from standard diff --git a/b3sum/Cargo.toml b/b3sum/Cargo.toml index c4c8068..aaa23e9 100644 --- a/b3sum/Cargo.toml +++ b/b3sum/Cargo.toml @@ -9,8 +9,8 @@ readme = "README.md" edition = "2018" [features] -default = ["c_avx512", "rayon"] -c_avx512 = ["blake3/c_avx512"] +default = ["c", "rayon"] +c = ["blake3/c"] c_neon = ["blake3/c_neon"] rayon = ["blake3/rayon", "memmap"] diff --git a/benches/bench.rs b/benches/bench.rs index 0d73970..70be967 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -4,7 +4,7 @@ extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; -use blake3::platform::MAX_SIMD_DEGREE; +use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use rand::prelude::*; use test::Bencher; @@ -48,173 +48,149 @@ impl RandomInput { } } -type CompressInPlaceFn = - unsafe fn(cv: &mut [u32; 8], block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { +fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); - unsafe { - b.iter(|| f(&mut state, input, 64 as u8, 0, 0)); - } + b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn(b, blake3::portable::compress_in_place); + bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::sse41::compress_in_place); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_single_compression_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_single_compression_fn(b, platform); } - bench_single_compression_fn(b, blake3::c_avx512::compress_in_place); } -type HashManyFn = unsafe fn( - inputs: &[&A], - key: &[u32; 8], - counter: u64, - increment_counter: blake3::IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn<[u8; CHUNK_LEN]>, degree: usize) { +fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::Yes, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::Yes, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn 
bench_many_chunks_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_chunks_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_chunks_fn(b, platform); } - bench_many_chunks_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } #[bench] #[cfg(feature = "c_neon")] fn bench_many_chunks_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. - bench_many_chunks_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_chunks_fn(b, platform); + } } // TODO: When we get const generics we can unify this with the chunks code. -fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn<[u8; BLOCK_LEN]>, degree: usize) { +fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } - unsafe { - b.iter(|| { - let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - f( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::No, - 0, - 0, - 0, - &mut out, - ); - }); - } + b.iter(|| { + let input_arrays: ArrayVec<[&[u8; BLOCK_LEN]; MAX_SIMD_DEGREE]> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::No, + 0, + 0, + 0, + &mut out, + ); + }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { - if !blake3::platform::sse41_detected() { - return; + if let Some(platform) = Platform::sse41() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::sse41::hash_many, blake3::sse41::DEGREE); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { - if !blake3::platform::avx2_detected() { - return; + if let Some(platform) = Platform::avx2() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::avx2::hash_many, blake3::avx2::DEGREE); } #[bench] -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] fn bench_many_parents_avx512(b: &mut Bencher) { - if !blake3::platform::avx512_detected() { - return; + if let Some(platform) = Platform::avx512() { + bench_many_parents_fn(b, platform); } - bench_many_parents_fn(b, blake3::c_avx512::hash_many, blake3::c_avx512::DEGREE); } #[bench] #[cfg(feature = "c_neon")] fn bench_many_parents_neon(b: &mut Bencher) { - // When "c_neon" is on, NEON support is assumed. 
- bench_many_parents_fn(b, blake3::c_neon::hash_many, blake3::c_neon::DEGREE); + if let Some(platform) = Platform::neon() { + bench_many_parents_fn(b, platform); + } } fn bench_atonce(b: &mut Bencher, len: usize) { diff --git a/build.rs b/build.rs index 67fe3fc..c5a662d 100644 --- a/build.rs +++ b/build.rs @@ -13,6 +13,11 @@ fn is_x86_64() -> bool { target_components()[0] == "x86_64" } +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + fn is_armv7() -> bool { target_components()[0] == "armv7" } @@ -28,6 +33,13 @@ fn is_windows_msvc() -> bool { && target_components()[3] == "msvc" } +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { @@ -37,16 +49,16 @@ fn new_build() -> cc::Build { } const WINDOWS_MSVC_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your version of the MSVC C compiler does -not support the "/arch:AVX512" flag. If you are building the "b3sum" or -"bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" -flag. (Note that this also disables other default features like Rayon-based +The "c" feature is enabled, but your version of the MSVC C compiler does not +support the "/arch:AVX512" flag. If you are building the "b3sum" or "bao_bin" +crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. +(Note that this also disables other default features like Rayon-based multithreading, which you can re-enable with "--features=rayon".) Other crates might or might not support this workaround. "#; const GNU_ERROR: &str = r#" -The "c_avx512" feature is enabled, but your C compiler does not support the +The "c" feature is enabled, but your C compiler does not support the "-mavx512f" flag. If you are building the "b3sum" or "bao_bin" crates, you can disable AVX-512 with Cargo's "--no-default-features" flag. (Note that this also disables other default features like Rayon-based multithreading, which you can @@ -69,25 +81,76 @@ fn check_for_avx512_compiler_support(build: &cc::Build) { } fn main() -> Result<(), Box> { - // "c_avx512' is a no-op for non-x86_64 targets. It also participates in - // dynamic CPU feature detection, so it's generally safe to enable. - // However, it probably won't build in some older environments without - // AVX-512 support in the C compiler, and it's disabled by default for that - // reason. - if defined("CARGO_FEATURE_C_AVX512") && is_x86_64() { - let mut build = new_build(); - check_for_avx512_compiler_support(&build); - build.file("c/blake3_avx512.c"); - if is_windows_msvc() { - // Note that a lot of versions of MSVC don't support /arch:AVX512, - // and they'll discard it with a warning, hopefully leading to a - // build error. - build.flag("/arch:AVX512"); + if defined("CARGO_FEATURE_C") { + if is_x86_64() && !defined("CARGO_FEATURE_C_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "c_prefer_intrinsics" feature is enabled. 
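+            // In short: MSVC targets get the *-windows-msvc.asm files,
+            // windows-gnu targets get the *-windows-gnu.S files, and all
+            // other 64-bit targets get the *-unix.S files (which include a
+            // small macOS workaround).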
+ if is_windows_msvc() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx2-x86_64-windows-msvc.asm"); + build.file("c/blake3-avx512-x86_64-windows-msvc.asm"); + build.compile("blake3_asm"); + } else if is_windows_gnu() { + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-windows-gnu.S"); + build.file("c/blake3-avx2-x86_64-windows-gnu.S"); + build.file("c/blake3-avx512-x86_64-windows-gnu.S"); + build.compile("blake3_asm"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + let mut build = new_build(); + build.file("c/blake3-sse41-x86_64-unix.S"); + build.file("c/blake3-avx2-x86_64-unix.S"); + build.file("c/blake3-avx512-x86_64-unix.S"); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "c_prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. + + let mut sse41_build = new_build(); + sse41_build.file("c/blake3_sse41.c"); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrisincs: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file("c/blake3_avx2.c"); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + check_for_avx512_compiler_support(&avx512_build); + avx512_build.file("c/blake3_avx512.c"); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. + avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); } else { - build.flag("-mavx512f"); - build.flag("-mavx512vl"); + // Currently no effect for non-x86 platforms. } - build.compile("blake3_avx512"); } if defined("CARGO_FEATURE_C_NEON") { diff --git a/src/avx2.rs b/src/avx2.rs deleted file mode 100644 index 7f36072..0000000 --- a/src/avx2.rs +++ /dev/null @@ -1,474 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, -}; -use arrayref::{array_mut_ref, mut_array_refs}; - -pub const DEGREE: usize = 8; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m256i { - // This is an unaligned load, so the pointer cast is allowed. - _mm256_loadu_si256(src as *const __m256i) -} - -#[inline(always)] -unsafe fn storeu(src: __m256i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. 
- _mm256_storeu_si256(dest as *mut __m256i, src) -} - -#[inline(always)] -unsafe fn add(a: __m256i, b: __m256i) -> __m256i { - _mm256_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { - _mm256_xor_si256(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m256i { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { - _mm256_setr_epi32( - a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, - ) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], 
v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { - ( - _mm256_permute2x128_si256(a, b, 0x20), - _mm256_permute2x128_si256(a, b, 0x31), - ) -} - -// There are several ways to do a transposition. We could do it naively, with 8 separate -// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy -// the vecs into contiguous storage and then use gather instructions. This third approach is to use -// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the -// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the -// https://github.com/oconnor663/bao_experiments repo. -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. - let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. - let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
- let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); - let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); - let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); - let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); - - vecs[0] = abcdefgh_0; - vecs[1] = abcdefgh_1; - vecs[2] = abcdefgh_2; - vecs[3] = abcdefgh_3; - vecs[4] = abcdefgh_4; - vecs[5] = abcdefgh_5; - vecs[6] = abcdefgh_6; - vecs[7] = abcdefgh_7; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set8( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), - counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), - counter_low(counter + (mask & 7)), - ), - set8( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), - counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), - counter_high(counter + (mask & 7)), - ), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash8( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. 
Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&mut h_vecs); - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash_many>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash8( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - crate::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::avx2_detected() { - return; - } - - #[target_feature(enable = "avx2")] - unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_hash_many() { - if !crate::platform::avx2_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/src/c_avx2.rs b/src/c_avx2.rs new file mode 100644 index 0000000..d805e86 --- /dev/null +++ b/src/c_avx2.rs @@ -0,0 +1,63 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. +pub unsafe fn hash_many>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/c_avx512.rs b/src/c_avx512.rs index f20de2c..c1b9f64 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 16; - // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn compress_in_place( cv: &mut CVWords, @@ -91,7 +89,6 @@ pub mod ffi { flags_end: u8, out: *mut u8, ); - } } diff --git a/src/c_neon.rs b/src/c_neon.rs index 34ef074..77b9654 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,7 +1,5 @@ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; -pub const DEGREE: usize = 4; - // Unsafe because this may only be called on platforms supporting NEON. pub unsafe fn hash_many>( inputs: &[&A], diff --git a/src/c_sse41.rs b/src/c_sse41.rs new file mode 100644 index 0000000..0b64c90 --- /dev/null +++ b/src/c_sse41.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn hash_many>( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + A::CAPACITY / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7fa3510..58d2dbe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -39,24 +39,32 @@ mod test; #[doc(hidden)] pub mod guts; -// These modules are pub for benchmarks only. They are not stable. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod avx2; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod c_avx512; -#[cfg(feature = "c_neon")] -#[doc(hidden)] -pub mod c_neon; +// The platform module is pub for benchmarks only. It is not stable. #[doc(hidden)] pub mod platform; -#[doc(hidden)] -pub mod portable; + +// Platform-specific implementations of the compression function. +mod portable; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[doc(hidden)] -pub mod sse41; +cfg_if::cfg_if! { + if #[cfg(feature = "c")] { + #[path = "c_sse41.rs"] + mod sse41; + #[path = "c_avx2.rs"] + mod avx2; + #[path = "c_avx512.rs"] + mod avx512; + } else { + #[path = "rust_sse41.rs"] + mod sse41; + #[path = "rust_avx2.rs"] + mod avx2; + // Stable Rust does not currently support AVX-512. + } +} +#[cfg(feature = "c_neon")] +#[path = "c_neon.rs"] +mod neon; pub mod traits; diff --git a/src/platform.rs b/src/platform.rs index b453a6e..163cbbb 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,18 +1,10 @@ use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; -#[cfg(feature = "c_avx512")] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::c_avx512; -#[cfg(feature = "c_neon")] -use crate::c_neon; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use crate::{avx2, sse41}; - cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; @@ -32,7 +24,7 @@ cfg_if::cfg_if! { cfg_if::cfg_if! 
{ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { - if #[cfg(feature = "c_avx512")] { + if #[cfg(feature = "c")] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; @@ -52,7 +44,7 @@ pub enum Platform { SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(feature = "c_neon")] @@ -64,7 +56,7 @@ impl Platform { pub fn detect() -> Self { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] { if avx512_detected() { return Platform::AVX512; @@ -93,7 +85,7 @@ impl Platform { Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(feature = "c_neon")] @@ -103,7 +95,7 @@ impl Platform { degree } - pub(crate) fn compress_in_place( + pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], @@ -116,13 +108,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_in_place(cv, block, block_len, counter, flags) + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_in_place(cv, block, block_len, counter, flags) + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. #[cfg(feature = "c_neon")] @@ -130,7 +122,7 @@ impl Platform { } } - pub(crate) fn compress_xof( + pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], @@ -143,13 +135,13 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress_xof(cv, block, block_len, counter, flags) + crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::compress_xof(cv, block, block_len, counter, flags) + crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(feature = "c_neon")] @@ -167,7 +159,7 @@ impl Platform { // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. - pub(crate) fn hash_many>( + pub fn hash_many>( &self, inputs: &[&A], key: &CVWords, @@ -192,7 +184,7 @@ impl Platform { // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { - sse41::hash_many( + crate::sse41::hash_many( inputs, key, counter, @@ -206,7 +198,7 @@ impl Platform { // Safe because detect() checked for platform support. 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { - avx2::hash_many( + crate::avx2::hash_many( inputs, key, counter, @@ -218,10 +210,10 @@ impl Platform { ) }, // Safe because detect() checked for platform support. - #[cfg(feature = "c_avx512")] + #[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { - c_avx512::hash_many( + crate::avx512::hash_many( inputs, key, counter, @@ -235,7 +227,7 @@ impl Platform { // Assumed to be safe if the "c_neon" feature is on. #[cfg(feature = "c_neon")] Platform::NEON => unsafe { - c_neon::hash_many( + crate::neon::hash_many( inputs, key, counter, @@ -248,11 +240,52 @@ impl Platform { }, } } + + // Explicit platform constructors, for benchmarks. + + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(feature = "c")] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(feature = "c_neon")] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn neon() -> Option { + // Assumed to be safe if the "c_neon" feature is on. + Some(Self::NEON) + } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. -#[cfg(feature = "c_avx512")] +#[cfg(feature = "c")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] pub fn avx512_detected() -> bool { diff --git a/src/rust_avx2.rs b/src/rust_avx2.rs new file mode 100644 index 0000000..7f36072 --- /dev/null +++ b/src/rust_avx2.rs @@ -0,0 +1,474 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; +use arrayref::{array_mut_ref, mut_array_refs}; + +pub const DEGREE: usize = 8; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m256i { + _mm256_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { + _mm256_setr_epi32( + a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, + ) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. 
Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = 
add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. This third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. + let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. + let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
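+    // (In interleave128 above, _mm256_permute2x128_si256 with selector 0x20
+    // combines the low 128-bit halves of its two inputs, and selector 0x31
+    // combines the high halves.)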
+ let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); + let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); + let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); + let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); + + vecs[0] = abcdefgh_0; + vecs[1] = abcdefgh_1; + vecs[2] = abcdefgh_2; + vecs[3] = abcdefgh_3; + vecs[4] = abcdefgh_4; + vecs[5] = abcdefgh_5; + vecs[6] = abcdefgh_6; + vecs[7] = abcdefgh_7; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set8( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), + ), + set8( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), + ), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash8( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. 
Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. + let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&mut h_vecs); + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash_many>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. + let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash8( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::avx2_detected() { + return; + } + + #[target_feature(enable = "avx2")] + unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
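+                // (Transposing swaps rows and columns, so the element written
+                // at (i, j) above must now be read back at (j, i).)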
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/rust_sse41.rs b/src/rust_sse41.rs new file mode 100644 index 0000000..fcf2f98 --- /dev/null +++ b/src/rust_sse41.rs @@ -0,0 +1,766 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. 
All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
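+    // For reference, that permutation (from the BLAKE3 spec) is
+    // 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, which is why the
+    // same shuffle/blend sequence repeats unchanged in rounds 2 through 7.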
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] 
= xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
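+    // After this second interleave, vector N holds lane N of each original
+    // vector, i.e. column N of the 4x4 matrix of 32-bit words.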
+ let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
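+        // State layout: v[0..8] are the transposed chaining values, v[8..12]
+        // are the IV constants, and v[12..16] hold the counters, the block
+        // length, and the flags, with each 32-bit lane corresponding to a
+        // different input.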
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1>( + input: &A, + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = input.as_slice(); + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many>( + mut inputs: &[&A], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
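+        // (A `&A` reference and a thin `*const u8` pointer have the same
+        // single-pointer layout, and the loop condition above guarantees at
+        // least DEGREE inputs remain, so reading DEGREE pointers here stays
+        // in bounds.)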
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = A::CAPACITY / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. + assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/src/sse41.rs b/src/sse41.rs deleted file mode 100644 index fcf2f98..0000000 --- a/src/sse41.rs +++ /dev/null @@ -1,766 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; -use arrayref::{array_mut_ref, array_ref, mut_array_refs}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. 
- -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - 
v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for i in 0..DEGREE { - _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); - } - let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - transpose_vecs(squares.2); - transpose_vecs(squares.3); - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); - transpose_vecs(squares.0); - transpose_vecs(squares.1); - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse4.1")] -unsafe fn hash1>( - input: &A, - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = input.as_slice(); - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - array_ref!(slice, 0, BLOCK_LEN), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash_many>( - mut inputs: &[&A], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = A::CAPACITY / BLOCK_LEN; - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - array_mut_ref!(out, 0, DEGREE * OUT_LEN), - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - out = &mut out[DEGREE * OUT_LEN..]; - } - for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { - hash1( - input, - key, - counter, - flags, - flags_start, - flags_end, - array_mut_ref!(output, 0, OUT_LEN), - ); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_transpose() { - if !crate::platform::sse41_detected() { - return; - } - - #[target_feature(enable = "sse4.1")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !crate::platform::sse41_detected() { - return; - } - crate::test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml index 007d1c8..2a90e39 100644 --- a/test_vectors/Cargo.toml +++ b/test_vectors/Cargo.toml @@ -3,10 +3,16 @@ name = "test_vectors" version = "0.0.0" edition = "2018" +[features] +default = [] +c = ["blake3/c"] +c_prefer_intrinsics = ["blake3/c_prefer_intrinsics"] +c_neon = ["blake3/c_neon"] + [dependencies] # If you ever change these path dependencies, you'll probably need to update # cross_test.sh, or CI will break. I'm sorry >.< -blake3 = { path = "../", features=["c_avx512"] } +blake3 = { path = "../" } hex = "0.4.0" reference_impl = { path = "../reference_impl" } serde = { version = "1.0", features = ["derive"] } diff --git a/test_vectors/cross_test.sh b/test_vectors/cross_test.sh index 1f6a34b..c4d280c 100755 --- a/test_vectors/cross_test.sh +++ b/test_vectors/cross_test.sh @@ -19,7 +19,7 @@ mv blake3/test_vectors . mv blake3/reference_impl test_vectors mv blake3 test_vectors cd test_vectors -sed -i 's|blake3 = { path = "../", features=\["c_avx512"\] }|blake3 = { path = "./blake3" }|' Cargo.toml +sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml cross test "$@" -- cgit v1.2.3