| | | |
|:--|:--|:--|
| author | Jack O'Connor <[email protected]> | 2019-12-10 14:20:09 -0500 |
| committer | Jack O'Connor <[email protected]> | 2019-12-11 18:05:26 -0500 |
| commit | 52ea6487f88a0e5cbc2f784f3095539afe6c91e4 | |
| tree | 181508c1840c2961e530e982c4525029d79e5685 | |
| parent | d68882da0d897c93a271a7c0f6d6b9b13d13aa16 | |
switch to representing CVs as words for the compression function
The portable implementation was getting slowed down by converting back
and forth between words and bytes.
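To illustrate the cost being removed, here is a minimal sketch. The conversion helpers mirror the `words_from_le_bytes_32`/`le_bytes_from_words_32` functions this commit adds to src/platform.rs, while `rounds` is a placeholder of my own, not the real seven-round compression (which also operates on words):

```rust
type CVWords = [u32; 8];
type CVBytes = [u8; 32]; // little-endian

fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords {
    let mut words = [0u32; 8];
    for i in 0..8 {
        words[i] = u32::from_le_bytes([
            bytes[i * 4],
            bytes[i * 4 + 1],
            bytes[i * 4 + 2],
            bytes[i * 4 + 3],
        ]);
    }
    words
}

fn le_bytes_from_words_32(words: &CVWords) -> CVBytes {
    let mut bytes = [0u8; 32];
    for i in 0..8 {
        bytes[i * 4..][..4].copy_from_slice(&words[i].to_le_bytes());
    }
    bytes
}

// Placeholder for the seven BLAKE3 rounds, which operate on words.
fn rounds(words: &mut CVWords) {
    for w in words.iter_mut() {
        *w = w.wrapping_mul(0x9E37_79B9).rotate_left(5);
    }
}

// Old shape: every block pays two endianness conversions.
fn compress_to_bytes(cv: &CVBytes) -> CVBytes {
    let mut words = words_from_le_bytes_32(cv); // decode, once per block
    rounds(&mut words);
    le_bytes_from_words_32(&words) // re-encode, once per block
}

// New shape: the CV stays in word form across the whole chunk, and the
// byte conversion happens only once, at the chunk boundary.
fn compress_in_place(cv: &mut CVWords) {
    rounds(cv);
}
```

In the SIMD paths the conversion was already free on little-endian x86 (a transmute, per the "x86 is little-endian" comments removed in the diff below), which is why the portable implementation benefited most.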
I made the corresponding change on the C side first
(https://github.com/veorq/BLAKE3-c/commit/12a37be8b50922a358c016ba07f46816a3da4a31),
and as part of this commit I'm re-vendoring the C code. I'm also
exposing a small FFI interface from Rust to C, so that blake3_neon.c can
link against portable.rs rather than blake3_portable.c; see c_neon.rs.
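That shim is small enough to quote; this is the exported function from the c_neon.rs hunk in the diff below, reproduced with explanatory comments:

```rust
// blake3_neon.c normally depends on blake3_portable.c for 1x compression.
// Exporting the portable Rust implementation under the same unmangled C
// symbol lets the NEON C code link against portable.rs instead, avoiding
// linking in unnecessary code.
#[no_mangle]
pub extern "C" fn blake3_compress_in_place_portable(
    cv: *mut u32,
    block: *const u8,
    block_len: u8,
    offset: u64,
    flags: u8,
) {
    unsafe {
        crate::portable::compress_in_place(
            &mut *(cv as *mut [u32; 8]),
            &*(block as *const [u8; 64]),
            block_len,
            offset,
            flags,
        )
    }
}
```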
Diffstat (limited to 'src')

| Mode | File | Lines changed |
|:--|:--|--:|
| -rw-r--r-- | src/avx2.rs | 23 |
| -rw-r--r-- | src/c/blake3.h | 12 |
| -rw-r--r-- | src/c/blake3_avx512.c | 213 |
| -rw-r--r-- | src/c/blake3_impl.h | 70 |
| -rw-r--r-- | src/c/blake3_neon.c | 25 |
| -rw-r--r-- | src/c/blake3_portable.c | 154 |
| -rw-r--r-- | src/c_avx512.rs | 65 |
| -rw-r--r-- | src/c_neon.rs | 41 |
| -rw-r--r-- | src/lib.rs | 103 |
| -rw-r--r-- | src/platform.rs | 119 |
| -rw-r--r-- | src/portable.rs | 133 |
| -rw-r--r-- | src/sse41.rs | 85 |
| -rw-r--r-- | src/test.rs | 48 |

13 files changed, 549 insertions, 542 deletions
diff --git a/src/avx2.rs b/src/avx2.rs index 14673c6..471a2dc 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -3,7 +3,7 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN}; +use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN}; use arrayref::{array_mut_ref, mut_array_refs}; pub const DEGREE: usize = 8; @@ -299,7 +299,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, _ pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -307,16 +307,15 @@ pub unsafe fn hash8( flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { - let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian let mut h_vecs = [ - set1(key_words[0]), - set1(key_words[1]), - set1(key_words[2]), - set1(key_words[3]), - set1(key_words[4]), - set1(key_words[5]), - set1(key_words[6]), - set1(key_words[7]), + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), ]; let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas); let mut block_flags = flags | flags_start; @@ -384,7 +383,7 @@ pub unsafe fn hash8( #[target_feature(enable = "avx2")] pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( mut inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, mut offset: u64, offset_deltas: &OffsetDeltas, flags: u8, diff --git a/src/c/blake3.h b/src/c/blake3.h index 8af3e04..f6e75d8 100644 --- a/src/c/blake3.h +++ b/src/c/blake3.h @@ -10,8 +10,7 @@ #define BLAKE3_MAX_SIMD_DEGREE 16 typedef struct { - uint8_t cv[32]; - uint8_t key[BLAKE3_KEY_LEN]; + uint32_t cv[8]; uint64_t offset; uint16_t count; uint8_t buf[BLAKE3_BLOCK_LEN]; @@ -20,9 +19,10 @@ typedef struct { } blake3_chunk_state; typedef struct { + uint32_t key[8]; blake3_chunk_state chunk; - uint8_t subtree_hashes_len; - uint8_t subtree_hashes[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN]; + uint8_t cv_stack_len; + uint8_t cv_stack[BLAKE3_MAX_DEPTH * BLAKE3_OUT_LEN]; } blake3_hasher; void blake3_hasher_init(blake3_hasher *self); @@ -32,5 +32,5 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); -void blake3_hasher_finalize(const blake3_hasher *self, - uint8_t out[BLAKE3_OUT_LEN]); +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c index 7b0dce1..f30e302 100644 --- a/src/c/blake3_avx512.c +++ b/src/c/blake3_avx512.c @@ -74,50 +74,49 @@ INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } * ---------------------------------------------------------------------------- */ -INLINE void g1(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4, +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { - *row1 = add_128(add_128(*row1, m), *row2); - *row4 = xor_128(*row4, *row1); - *row4 = rot16_128(*row4); - *row3 = add_128(*row3, *row4); - *row2 = xor_128(*row2, *row3); - *row2 = rot12_128(*row2); + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); } -INLINE void 
g2(__m128i *row1, __m128i *row2, __m128i *row3, __m128i *row4, +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, __m128i m) { - *row1 = add_128(add_128(*row1, m), *row2); - *row4 = xor_128(*row4, *row1); - *row4 = rot8_128(*row4); - *row3 = add_128(*row3, *row4); - *row2 = xor_128(*row2, *row3); - *row2 = rot7_128(*row2); + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); } -// Note the optimization here of leaving row2 as the unrotated row, rather than -// row1. All the message loads below are adjusted to compensate for this. See +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See // discussion at https://github.com/sneves/blake2-avx2/pull/4 -INLINE void diagonalize(__m128i *row1, __m128i *row3, __m128i *row4) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(2, 1, 0, 3)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(0, 3, 2, 1)); +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); } -INLINE void undiagonalize(__m128i *row1, __m128i *row3, __m128i *row4) { - *row1 = _mm_shuffle_epi32(*row1, _MM_SHUFFLE(0, 3, 2, 1)); - *row4 = _mm_shuffle_epi32(*row4, _MM_SHUFFLE(1, 0, 3, 2)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(2, 1, 0, 3)); +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } -void blake3_compress_avx512(const uint8_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]) { - __m128i row1 = loadu_128(&cv[0]); - __m128i row2 = loadu_128(&cv[16]); - __m128i row3 = set4(IV[0], IV[1], IV[2], IV[3]); - __m128i row4 = set4(offset_low(offset), offset_high(offset), - (uint32_t)block_len, (uint32_t)flags); +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len, + (uint32_t)flags); __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); @@ -129,160 +128,177 @@ void blake3_compress_avx512(const uint8_t cv[8], // round 1 buf = _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); buf = _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(3, 1, 3, 1))); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(3, 2, 0, 1)); t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 1, 3, 2)); buf = _mm_blend_epi16(t0, t1, 
0xC3); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(t0, t1, 0x3C); buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 2 t0 = _mm_blend_epi16(m1, m2, 0x0C); t1 = _mm_slli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); t1 = _mm_blend_epi16(m1, m3, 0xC0); t2 = _mm_blend_epi16(t0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_slli_si128(m1, 4); t1 = _mm_blend_epi16(m2, t0, 0x30); t2 = _mm_blend_epi16(m0, t1, 0xF0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_slli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0x0C); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 1, 2)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 3 t0 = _mm_unpackhi_epi32(m2, m3); t1 = _mm_blend_epi16(m3, m1, 0x0C); t2 = _mm_blend_epi16(t0, t1, 0x0F); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpacklo_epi32(m2, m0); t1 = _mm_blend_epi16(t0, m0, 0xF0); t2 = _mm_slli_si128(m3, 8); buf = _mm_blend_epi16(t1, t2, 0xC0); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m0, m2, 0x3C); t1 = _mm_srli_si128(m1, 12); t2 = _mm_blend_epi16(t0, t1, 0x03); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 3, 2, 1)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_slli_si128(m3, 4); t1 = _mm_blend_epi16(m0, m1, 0x33); t2 = _mm_blend_epi16(t1, t0, 0xC0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 4 t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_unpackhi_epi32(t0, m2); t2 = _mm_blend_epi16(t1, m3, 0x0C); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_slli_si128(m2, 8); t1 = _mm_blend_epi16(m3, m0, 0x0C); t2 = _mm_blend_epi16(t1, t0, 0xC0); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m0, m1, 0x0F); t1 = _mm_blend_epi16(t0, m3, 0xC0); buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_alignr_epi8(m0, m1, 4); buf = _mm_blend_epi16(t0, m2, 0x33); - 
g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 5 t0 = _mm_unpacklo_epi64(m1, m2); t1 = _mm_unpackhi_epi64(m0, m2); t2 = _mm_blend_epi16(t0, t1, 0x33); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi64(m1, m3); t1 = _mm_unpacklo_epi64(m0, m1); buf = _mm_blend_epi16(t0, t1, 0x33); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_unpackhi_epi64(m3, m1); t1 = _mm_unpackhi_epi64(m2, m0); t2 = _mm_blend_epi16(t1, t0, 0x33); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(m0, m2, 0x03); t1 = _mm_slli_si128(t0, 8); t2 = _mm_blend_epi16(t1, m3, 0x0F); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 3, 1)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 6 t0 = _mm_unpackhi_epi32(m0, m1); t1 = _mm_unpacklo_epi32(m0, m2); buf = _mm_unpacklo_epi64(t0, t1); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_srli_si128(m2, 4); t1 = _mm_blend_epi16(m0, m3, 0x03); buf = _mm_blend_epi16(t1, t0, 0x3C); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_blend_epi16(m1, m0, 0x0C); t1 = _mm_srli_si128(m3, 4); t2 = _mm_blend_epi16(t0, t1, 0x30); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpacklo_epi64(m2, m1); t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(2, 0, 1, 0)); t2 = _mm_srli_si128(t0, 4); buf = _mm_blend_epi16(t1, t2, 0x33); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); // round 7 t0 = _mm_slli_si128(m1, 12); t1 = _mm_blend_epi16(m0, m3, 0x33); buf = _mm_blend_epi16(t1, t0, 0xC0); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_blend_epi16(m3, m2, 0x30); t1 = _mm_srli_si128(m1, 4); t2 = _mm_blend_epi16(t0, t1, 0x03); buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0)); - g2(&row1, &row2, &row3, &row4, buf); - diagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + diagonalize(&rows[0], &rows[2], &rows[3]); t0 = _mm_unpacklo_epi64(m0, m2); t1 = _mm_srli_si128(m1, 4); buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(3, 1, 0, 2)); - g1(&row1, &row2, &row3, &row4, buf); + g1(&rows[0], &rows[1], &rows[2], &rows[3], buf); t0 = _mm_unpackhi_epi32(m1, m2); t1 = _mm_unpackhi_epi64(m0, t0); buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 1, 2, 3)); - g2(&row1, &row2, &row3, &row4, buf); - undiagonalize(&row1, &row3, &row4); + g2(&rows[0], &rows[1], &rows[2], &rows[3], buf); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]) { + 
__m128i rows[4]; + compress_pre(rows, cv, block, block_len, offset, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} - storeu_128(xor_128(row1, row3), &out[0]); - storeu_128(xor_128(row2, row4), &out[16]); - storeu_128(xor_128(row3, loadu_128(&cv[0])), &out[32]); - storeu_128(xor_128(row4, loadu_128(&cv[16])), &out[48]); +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, offset, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); } /* @@ -461,15 +477,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], } void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m128i h_vecs[8] = { - set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]), - set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]), - set1_128(key_words[6]), set1_128(key_words[7]), + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; __m128i offset_low_vec, offset_high_vec; load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -710,15 +723,12 @@ INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8], } void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m256i h_vecs[8] = { - set1_256(key_words[0]), set1_256(key_words[1]), set1_256(key_words[2]), - set1_256(key_words[3]), set1_256(key_words[4]), set1_256(key_words[5]), - set1_256(key_words[6]), set1_256(key_words[7]), + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), }; __m256i offset_low_vec, offset_high_vec; load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -1023,16 +1033,13 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16], } void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - memcpy(key_words, key, BLAKE3_KEY_LEN); // x86 is little-endian __m512i h_vecs[8] = { - set1_512(key_words[0]), set1_512(key_words[1]), set1_512(key_words[2]), - set1_512(key_words[3]), set1_512(key_words[4]), set1_512(key_words[5]), - set1_512(key_words[6]), set1_512(key_words[7]), + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), 
set1_512(key[6]), set1_512(key[7]), }; __m512i offset_low_vec, offset_high_vec; load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -1107,20 +1114,18 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; + uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { if (blocks == 1) { block_flags |= flags_end; } - uint8_t out[64]; - blake3_compress_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -1129,7 +1134,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, } void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h index d2e0325..af55d93 100644 --- a/src/c/blake3_impl.h +++ b/src/c/blake3_impl.h @@ -32,12 +32,6 @@ static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; -static const uint8_t IV_BYTES[32] = { - 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e, - 0x3c, 0x3a, 0xf5, 0x4f, 0xa5, 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68, - 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b, -}; - static const uint8_t MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -81,41 +75,71 @@ INLINE uint32_t offset_high(uint64_t offset) { return (uint32_t)(offset >> 32); } +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + // Declarations for implementation-specific functions. 
-void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); -void blake3_compress_sse41(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); -void blake3_compress_avx512(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]); +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags); +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t offset, + uint8_t flags, uint8_t out[64]); void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t od, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c index 86a99bf..0ffa17e 100644 --- a/src/c/blake3_neon.c +++ b/src/c/blake3_neon.c @@ -223,17 +223,12 @@ INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4], } void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, + const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - uint32_t key_words[8]; - // TODO: This is assuming little-endian. Is there such a thing as NEON on - // big-endian? 
- memcpy(key_words, key, BLAKE3_KEY_LEN); uint32x4_t h_vecs[8] = { - set1_128(key_words[0]), set1_128(key_words[1]), set1_128(key_words[2]), - set1_128(key_words[3]), set1_128(key_words[4]), set1_128(key_words[5]), - set1_128(key_words[6]), set1_128(key_words[7]), + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), }; uint32x4_t offset_low_vec, offset_high_vec; load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec); @@ -294,10 +289,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, */ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], uint64_t offset, - uint8_t flags, uint8_t flags_start, uint8_t flags_end, + const uint32_t key[8], uint64_t offset, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; + uint32_t cv[8]; memcpy(cv, key, BLAKE3_KEY_LEN); uint8_t block_flags = flags | flags_start; while (blocks > 0) { @@ -307,10 +302,8 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, // TODO: Implement compress_neon. However note that according to // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, // compress_neon might not be any faster than compress_portable. - uint8_t out[64]; - blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset, + block_flags); input = &input[BLAKE3_BLOCK_LEN]; blocks -= 1; block_flags = flags; @@ -319,7 +312,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks, } void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], + size_t blocks, const uint32_t key[8], uint64_t offset, offset_deltas_t offset_deltas, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { diff --git a/src/c/blake3_portable.c b/src/c/blake3_portable.c deleted file mode 100644 index 6b58cf4..0000000 --- a/src/c/blake3_portable.c +++ /dev/null @@ -1,154 +0,0 @@ -#include "blake3_impl.h" -#include <string.h> - -INLINE uint32_t load32(const void *src) { - const uint8_t *p = (const uint8_t *)src; - return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | - ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); -} - -INLINE void store32(void *dst, uint32_t w) { - uint8_t *p = (uint8_t *)dst; - p[0] = (uint8_t)(w >> 0); - p[1] = (uint8_t)(w >> 8); - p[2] = (uint8_t)(w >> 16); - p[3] = (uint8_t)(w >> 24); -} - -INLINE uint32_t rotr32(uint32_t w, uint32_t c) { - return (w >> c) | (w << (32 - c)); -} - -INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, - uint32_t x, uint32_t y) { - state[a] = state[a] + state[b] + x; - state[d] = rotr32(state[d] ^ state[a], 16); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 12); - state[a] = state[a] + state[b] + y; - state[d] = rotr32(state[d] ^ state[a], 8); - state[c] = state[c] + state[d]; - state[b] = rotr32(state[b] ^ state[c], 7); -} - -INLINE void round_fn(uint32_t *state, const uint32_t *msg, size_t round) { - // Select the message schedule based on the round. - const uint8_t *schedule = MSG_SCHEDULE[round]; - - // Mix the columns. 
- g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the rows. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -void blake3_compress_portable(const uint8_t cv[BLAKE3_OUT_LEN], - const uint8_t block[BLAKE3_BLOCK_LEN], - uint8_t block_len, uint64_t offset, uint8_t flags, - uint8_t out[64]) { - uint32_t block_words[16]; - block_words[0] = load32(block + 4 * 0); - block_words[1] = load32(block + 4 * 1); - block_words[2] = load32(block + 4 * 2); - block_words[3] = load32(block + 4 * 3); - block_words[4] = load32(block + 4 * 4); - block_words[5] = load32(block + 4 * 5); - block_words[6] = load32(block + 4 * 6); - block_words[7] = load32(block + 4 * 7); - block_words[8] = load32(block + 4 * 8); - block_words[9] = load32(block + 4 * 9); - block_words[10] = load32(block + 4 * 10); - block_words[11] = load32(block + 4 * 11); - block_words[12] = load32(block + 4 * 12); - block_words[13] = load32(block + 4 * 13); - block_words[14] = load32(block + 4 * 14); - block_words[15] = load32(block + 4 * 15); - - uint32_t state[16] = { - load32(&cv[0 * 4]), - load32(&cv[1 * 4]), - load32(&cv[2 * 4]), - load32(&cv[3 * 4]), - load32(&cv[4 * 4]), - load32(&cv[5 * 4]), - load32(&cv[6 * 4]), - load32(&cv[7 * 4]), - IV[0], - IV[1], - IV[2], - IV[3], - offset_low(offset), - offset_high(offset), - (uint32_t)block_len, - (uint32_t)flags, - }; - - round_fn(&state[0], &block_words[0], 0); - round_fn(&state[0], &block_words[0], 1); - round_fn(&state[0], &block_words[0], 2); - round_fn(&state[0], &block_words[0], 3); - round_fn(&state[0], &block_words[0], 4); - round_fn(&state[0], &block_words[0], 5); - round_fn(&state[0], &block_words[0], 6); - - store32(&out[0 * 4], state[0] ^ state[8]); - store32(&out[1 * 4], state[1] ^ state[9]); - store32(&out[2 * 4], state[2] ^ state[10]); - store32(&out[3 * 4], state[3] ^ state[11]); - store32(&out[4 * 4], state[4] ^ state[12]); - store32(&out[5 * 4], state[5] ^ state[13]); - store32(&out[6 * 4], state[6] ^ state[14]); - store32(&out[7 * 4], state[7] ^ state[15]); - store32(&out[8 * 4], state[8] ^ cv[0]); - store32(&out[9 * 4], state[9] ^ cv[1]); - store32(&out[10 * 4], state[10] ^ cv[2]); - store32(&out[11 * 4], state[11] ^ cv[3]); - store32(&out[12 * 4], state[12] ^ cv[4]); - store32(&out[13 * 4], state[13] ^ cv[5]); - store32(&out[14 * 4], state[14] ^ cv[6]); - store32(&out[15 * 4], state[15] ^ cv[7]); -} - -INLINE void hash_one_portable(const uint8_t *input, size_t blocks, - const uint8_t key[BLAKE3_KEY_LEN], - uint64_t offset, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, - uint8_t out[BLAKE3_OUT_LEN]) { - uint8_t cv[32]; - memcpy(cv, key, 32); - uint8_t block_flags = flags | flags_start; - while (blocks > 0) { - if (blocks == 1) { - block_flags |= flags_end; - } - uint8_t out[64]; - blake3_compress_portable(cv, input, BLAKE3_BLOCK_LEN, offset, block_flags, - out); - memcpy(cv, out, 32); - input = &input[BLAKE3_BLOCK_LEN]; - blocks -= 1; - block_flags = flags; - } - memcpy(out, cv, 32); -} - -void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, - size_t blocks, const uint8_t key[BLAKE3_KEY_LEN], - uint64_t offset, offset_deltas_t 
offset_deltas, - uint8_t flags, uint8_t flags_start, - uint8_t flags_end, uint8_t *out) { - while (num_inputs > 0) { - hash_one_portable(inputs[0], blocks, key, offset, flags, flags_start, - flags_end, out); - inputs += 1; - num_inputs -= 1; - offset += offset_deltas[1]; - out = &out[BLAKE3_OUT_LEN]; - } -} diff --git a/src/c_avx512.rs b/src/c_avx512.rs index 4ea2542..0120a2c 100644 --- a/src/c_avx512.rs +++ b/src/c_avx512.rs @@ -1,17 +1,29 @@ -use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; +use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; +use arrayref::array_ref; pub const DEGREE: usize = 16; // Unsafe because this may only be called on platforms supporting AVX-512. -pub unsafe fn compress( - cv: &[u8; 32], +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags) +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress_xof( + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, ) -> [u8; 64] { let mut out = [0u8; 64]; - ffi::blake3_compress_avx512( + ffi::blake3_compress_xof_avx512( cv.as_ptr(), block.as_ptr(), block_len, @@ -25,7 +37,7 @@ pub unsafe fn compress( // Unsafe because this may only be called on platforms supporting AVX-512. pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -53,53 +65,26 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( pub mod ffi { extern "C" { - pub fn blake3_compress_avx512( - cv: *const u8, + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, block: *const u8, block_len: u8, offset: u64, flags: u8, - out: *mut u8, - ); - // hash4/hash8/hash16 are exposed here for benchmarks. - pub fn blake3_hash4_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); - pub fn blake3_hash8_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, ); - pub fn blake3_hash16_avx512( - inputs: *const *const u8, - blocks: usize, - key: *const u8, + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, offset: u64, - offset_deltas: *const u64, flags: u8, - flags_start: u8, - flags_end: u8, out: *mut u8, ); pub fn blake3_hash_many_avx512( inputs: *const *const u8, num_inputs: usize, blocks: usize, - key: *const u8, + key: *const u32, offset: u64, offset_deltas: *const u64, flags: u8, @@ -119,7 +104,7 @@ mod test { if !crate::platform::avx512_detected() { return; } - crate::test::test_compress_fn(compress); + crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] diff --git a/src/c_neon.rs b/src/c_neon.rs index 15e76d2..029bf7e 100644 --- a/src/c_neon.rs +++ b/src/c_neon.rs @@ -1,11 +1,11 @@ -use crate::{OffsetDeltas, BLOCK_LEN, KEY_LEN, OUT_LEN}; +use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN}; pub const DEGREE: usize = 4; // Unsafe because this may only be called on platforms supporting NEON. 
pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -31,25 +31,36 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>( ) } +// blake3_neon.c normally depends on blake3_portable.c, because the NEON +// implementation only provides 4x compression, and it relies on the portable +// implementation for 1x compression. However, we expose the portable Rust +// implementation here instead, to avoid linking in unnecessary code. +#[no_mangle] +pub extern "C" fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + offset: u64, + flags: u8, +) { + unsafe { + crate::portable::compress_in_place( + &mut *(cv as *mut [u32; 8]), + &*(block as *const [u8; 64]), + block_len, + offset, + flags, + ) + } +} + pub mod ffi { extern "C" { - // Exposed here for benchmarks. - pub fn blake3_hash4_neon( - inputs: *const *const u8, - blocks: usize, - key: *const u8, - offset: u64, - offset_deltas: *const u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: *mut u8, - ); pub fn blake3_hash_many_neon( inputs: *const *const u8, num_inputs: usize, blocks: usize, - key: *const u8, + key: *const u32, offset: u64, offset_deltas: *const u64, flags: u8, @@ -40,15 +40,18 @@ pub const BLOCK_LEN: usize = 64; #[doc(hidden)] pub const CHUNK_LEN: usize = 2048; -const IV: &[u32; 8] = &[ +// While iterating the compression function within a chunk, the CV is +// represented as words, to avoid doing two extra endianness conversions for +// each compression in the portable implementation. But the hash_many interface +// needs to hash both input bytes and parent nodes, so its better for its +// output CVs to be represented as bytes. +type CVWords = [u32; 8]; +type CVBytes = [u8; 32]; // little-endian + +const IV: &CVWords = &[ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, ]; -const IV_BYTES: &[u8; 32] = &[ - 0x67, 0xe6, 0x09, 0x6a, 0x85, 0xae, 0x67, 0xbb, 0x72, 0xf3, 0x6e, 0x3c, 0x3a, 0xf5, 0x4f, 0xa5, - 0x7f, 0x52, 0x0e, 0x51, 0x8c, 0x68, 0x05, 0x9b, 0xab, 0xd9, 0x83, 0x1f, 0x19, 0xcd, 0xe0, 0x5b, -]; - const MSG_SCHEDULE: [[usize; 16]; 7] = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], @@ -173,7 +176,7 @@ impl fmt::Debug for Hash { // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. 
struct Output { - input_chaining_value: [u8; 32], + input_chaining_value: CVWords, block: [u8; 64], block_len: u8, offset: u64, @@ -182,34 +185,31 @@ struct Output { } impl Output { - fn chaining_value(&self) -> [u8; 32] { - let out = self.platform.compress( - &self.input_chaining_value, + fn chaining_value(&self) -> CVBytes { + let mut cv = self.input_chaining_value; + self.platform.compress_in_place( + &mut cv, &self.block, self.block_len, self.offset, self.flags, ); - *array_ref!(out, 0, 32) + platform::le_bytes_from_words_32(&cv) } fn root_hash(&self) -> Hash { debug_assert_eq!(self.offset, 0); - let out = self.platform.compress( - &self.input_chaining_value, - &self.block, - self.block_len, - 0, - self.flags | ROOT, - ); - Hash(*array_ref!(out, 0, 32)) + let mut cv = self.input_chaining_value; + self.platform + .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); + Hash(platform::le_bytes_from_words_32(&cv)) } fn root_output_bytes(&self, out_slice: &mut [u8]) { debug_assert_eq!(self.offset, 0); let mut offset = 0; for out_block in out_slice.chunks_mut(2 * OUT_LEN) { - let out_bytes = self.platform.compress( + let out_bytes = self.platform.compress_xof( &self.input_chaining_value, &self.block, self.block_len, @@ -224,7 +224,7 @@ impl Output { #[derive(Clone)] struct ChunkState { - cv: [u8; 32], + cv: CVWords, offset: u64, buf: [u8; BLOCK_LEN], buf_len: u8, @@ -234,7 +234,7 @@ struct ChunkState { } impl ChunkState { - fn new(key: &[u8; 32], offset: u64, flags: u8, platform: Platform) -> Self { + fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self { Self { cv: *key, offset, @@ -246,7 +246,7 @@ impl ChunkState { } } - fn reset(&mut self, key: &[u8; KEY_LEN], new_offset: u64) { + fn reset(&mut self, key: &CVWords, new_offset: u64) { debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0); self.cv = *key; self.offset = new_offset; @@ -283,14 +283,13 @@ impl ChunkState { if !input.is_empty() { debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); let block_flags = self.flags | self.start_flag(); // borrowck - let output = self.platform.compress( - &self.cv, + self.platform.compress_in_place( + &mut self.cv, &self.buf, BLOCK_LEN as u8, self.offset, block_flags, ); - self.cv = *array_ref!(output, 0, 32); self.buf_len = 0; self.buf = [0; BLOCK_LEN]; self.blocks_compressed += 1; @@ -300,14 +299,13 @@ impl ChunkState { while input.len() > BLOCK_LEN { debug_assert_eq!(self.buf_len, 0); let block_flags = self.flags | self.start_flag(); // borrowck - let output = self.platform.compress( - &self.cv, + self.platform.compress_in_place( + &mut self.cv, array_ref!(input, 0, BLOCK_LEN), BLOCK_LEN as u8, self.offset, block_flags, ); - self.cv = *array_ref!(output, 0, 32); self.blocks_compressed += 1; input = &input[BLOCK_LEN..]; } @@ -400,7 +398,7 @@ where // those cases use a different codepath. fn compress_chunks_parallel( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -448,7 +446,7 @@ fn compress_chunks_parallel( // never empty; those cases use a different codepath. fn compress_parents_parallel( child_chaining_values: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, flags: u8, platform: Platform, out: &mut [u8], @@ -501,7 +499,7 @@ fn compress_parents_parallel( // codepath. fn compress_subtree_wide( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -573,7 +571,7 @@ fn compress_subtree_wide( // chunk or less. That's a different codepath. 
fn compress_subtree_to_parent_node( input: &[u8], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, platform: Platform, @@ -597,7 +595,7 @@ fn compress_subtree_to_parent_node( // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. -fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output { +fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { let platform = Platform::detect(); // If the whole subtree is one chunk, hash it directly with a ChunkState. @@ -621,23 +619,25 @@ fn hash_all_at_once(input: &[u8], key: &[u8; KEY_LEN], flags: u8) -> Output { /// The default hash function. pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once(input, IV_BYTES, 0).root_hash() + hash_all_at_once(input, IV, 0).root_hash() } /// The keyed hash function. pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - hash_all_at_once(input, key, KEYED_HASH).root_hash() + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once(input, &key_words, KEYED_HASH).root_hash() } /// The key derivation function. pub fn derive_key(key: &[u8; KEY_LEN], context: &[u8]) -> Hash { - hash_all_at_once(context, key, DERIVE_KEY).root_hash() + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once(context, &key_words, DERIVE_KEY).root_hash() } fn parent_node_output( - left_child: &[u8; 32], - right_child: &[u8; 32], - key: &[u8; KEY_LEN], + left_child: &CVBytes, + right_child: &CVBytes, + key: &CVWords, flags: u8, platform: Platform, ) -> Output { @@ -658,14 +658,14 @@ fn parent_node_output( /// support for extendable output. #[derive(Clone)] pub struct Hasher { - key: [u8; KEY_LEN], + key: CVWords, chunk_state: ChunkState, // 2^53 * 2048 = 2^64 - cv_stack: ArrayVec<[[u8; OUT_LEN]; 53]>, + cv_stack: ArrayVec<[CVBytes; 53]>, } impl Hasher { - fn new_internal(key: &[u8; 32], flags: u8) -> Self { + fn new_internal(key: &CVWords, flags: u8) -> Self { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), @@ -675,12 +675,13 @@ impl Hasher { /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { - Self::new_internal(IV_BYTES, 0) + Self::new_internal(IV, 0) } /// Construct a new `Hasher` for the keyed hash function. pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - Self::new_internal(key, KEYED_HASH) + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. @@ -691,7 +692,8 @@ impl Hasher { /// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(key: &[u8; KEY_LEN]) -> Self { - Self::new_internal(key, DERIVE_KEY) + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, DERIVE_KEY) } /// The total number of input bytes so far. 
@@ -705,19 +707,18 @@ impl Hasher { while self.cv_stack.len() > post_merge_stack_len { let right_child = self.cv_stack.pop().unwrap(); let left_child = self.cv_stack.pop().unwrap(); - let parent_cv = parent_node_output( + let parent_output = parent_node_output( &left_child, &right_child, &self.key, self.chunk_state.flags, self.chunk_state.platform, - ) - .chaining_value(); - self.cv_stack.push(parent_cv); + ); + self.cv_stack.push(parent_output.chaining_value()); } } - fn push_cv(&mut self, new_cv: &[u8; 32], offset: u64) { + fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) { // In reference_impl.rs, we merge the new CV with existing CVs from the // stack before pushing it. We can do that because we know more input // is coming, so we know none of the merges are root. diff --git a/src/platform.rs b/src/platform.rs index 4621fc5..b42f7b9 100644 --- a/src/platform.rs +++ b/src/platform.rs @@ -1,4 +1,5 @@ -use crate::{portable, OffsetDeltas, BLOCK_LEN, KEY_LEN}; +use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN}; +use arrayref::{array_mut_ref, array_ref}; #[cfg(feature = "c_avx512")] use crate::c_avx512; @@ -91,27 +92,55 @@ impl Platform { degree } - pub(crate) fn compress( + pub(crate) fn compress_in_place( &self, - cv: &[u8; 32], + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + sse41::compress_in_place(cv, block, block_len, offset, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(feature = "c_avx512")] + Platform::AVX512 => unsafe { + c_avx512::compress_in_place(cv, block, block_len, offset, flags) + }, + // No NEON compress_in_place() implementation yet. + #[cfg(feature = "c_neon")] + Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags), + } + } + + pub(crate) fn compress_xof( + &self, + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, ) -> [u8; 64] { match self { - Platform::Portable => portable::compress(cv, block, block_len, offset, flags), + Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { - sse41::compress(cv, block, block_len, offset, flags) + sse41::compress_xof(cv, block, block_len, offset, flags) }, // Safe because detect() checked for platform support. #[cfg(feature = "c_avx512")] - Platform::AVX512 => unsafe { c_avx512::compress(cv, block, block_len, offset, flags) }, - // No NEON compress() implementation yet. + Platform::AVX512 => unsafe { + c_avx512::compress_xof(cv, block, block_len, offset, flags) + }, + // No NEON compress_xof() implementation yet. 
#[cfg(feature = "c_neon")] - Platform::NEON => portable::compress(cv, block, block_len, offset, flags), + Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags), } } @@ -128,7 +157,7 @@ impl Platform { pub(crate) fn hash_many<A: arrayvec::Array<Item = u8>>( &self, inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -262,3 +291,75 @@ pub fn sse41_detected() -> bool { } false } + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); + out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); + out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); + out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); + out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); + out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); + out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); + out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); + *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); + *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); + *array_mut_ref!(out, 11 * 
4, 4) = words[11].to_le_bytes(); + *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); + *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); + *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); + *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); + out +} diff --git a/src/portable.rs b/src/portable.rs index b07c46a..fa0e17d 100644 --- a/src/portable.rs +++ b/src/portable.rs @@ -1,4 +1,6 @@ -use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN}; +use crate::{ + offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; use arrayref::{array_mut_ref, array_ref}; #[inline(always)] @@ -31,41 +33,25 @@ fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); } -pub fn compress( - cv: &[u8; 32], +#[inline(always)] +fn compress_pre( + cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8, -) -> [u8; 64] { - let block_words = [ - u32::from_le_bytes(*array_ref!(block, 0 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 1 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 2 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 3 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 4 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 5 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 6 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 7 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 8 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 9 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 10 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 11 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 12 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 13 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 14 * 4, 4)), - u32::from_le_bytes(*array_ref!(block, 15 * 4, 4)), - ]; +) -> [u32; 16] { + let block_words = crate::platform::words_from_le_bytes_64(block); let mut state = [ - u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)), - u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)), + cv[0], + cv[1], + cv[2], + cv[3], + cv[4], + cv[5], + cv[6], + cv[7], IV[0], IV[1], IV[2], @@ -84,6 +70,36 @@ pub fn compress( round(&mut state, &block_words, 5); round(&mut state, &block_words, 6); + state +} + +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) { + let state = compress_pre(cv, block, block_len, offset, flags); + + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + offset: u64, + flags: u8, +) -> [u8; 64] { + let mut state = compress_pre(cv, block, block_len, offset, flags); state[0] ^= state[8]; state[1] ^= state[9]; state[2] ^= state[10]; @@ -92,43 +108,25 @@ pub fn compress( state[5] ^= state[13]; state[6] ^= state[14]; state[7] ^= state[15]; - state[8] ^= u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)); - state[9] ^= u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)); - state[10] ^= 
u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)); - state[11] ^= u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)); - state[12] ^= u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)); - state[13] ^= u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)); - state[14] ^= u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)); - state[15] ^= u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)); - - let mut out = [0; 64]; - out[0 * 4..][..4].copy_from_slice(&state[0].to_le_bytes()); - out[1 * 4..][..4].copy_from_slice(&state[1].to_le_bytes()); - out[2 * 4..][..4].copy_from_slice(&state[2].to_le_bytes()); - out[3 * 4..][..4].copy_from_slice(&state[3].to_le_bytes()); - out[4 * 4..][..4].copy_from_slice(&state[4].to_le_bytes()); - out[5 * 4..][..4].copy_from_slice(&state[5].to_le_bytes()); - out[6 * 4..][..4].copy_from_slice(&state[6].to_le_bytes()); - out[7 * 4..][..4].copy_from_slice(&state[7].to_le_bytes()); - out[8 * 4..][..4].copy_from_slice(&state[8].to_le_bytes()); - out[9 * 4..][..4].copy_from_slice(&state[9].to_le_bytes()); - out[10 * 4..][..4].copy_from_slice(&state[10].to_le_bytes()); - out[11 * 4..][..4].copy_from_slice(&state[11].to_le_bytes()); - out[12 * 4..][..4].copy_from_slice(&state[12].to_le_bytes()); - out[13 * 4..][..4].copy_from_slice(&state[13].to_le_bytes()); - out[14 * 4..][..4].copy_from_slice(&state[14].to_le_bytes()); - out[15 * 4..][..4].copy_from_slice(&state[15].to_le_bytes()); - out + state[8] ^= cv[0]; + state[9] ^= cv[1]; + state[10] ^= cv[2]; + state[11] ^= cv[3]; + state[12] ^= cv[4]; + state[13] ^= cv[5]; + state[14] ^= cv[6]; + state[15] ^= cv[7]; + crate::platform::le_bytes_from_words_64(&state) } pub fn hash1<A: arrayvec::Array<Item = u8>>( input: &A, - key: &[u8; KEY_LEN], + key: &CVWords, offset: u64, flags: u8, flags_start: u8, flags_end: u8, - out: &mut [u8; OUT_LEN], + out: &mut CVBytes, ) { debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; @@ -138,23 +136,22 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>( if slice.len() == BLOCK_LEN { block_flags |= flags_end; } - let output = compress( - &cv, + compress_in_place( + &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, offset, block_flags, ); - cv = *array_ref!(output, 0, 32); block_flags = flags; slice = &slice[BLOCK_LEN..]; } - *out = cv; + *out = crate::platform::le_bytes_from_words_32(&cv); } pub fn hash_many<A: arrayvec::Array<Item = u8>>( inputs: &[&A], - key: &[u8; KEY_LEN], + key: &CVWords, mut offset: u64, offset_deltas: &OffsetDeltas, flags: u8, @@ -182,12 +179,12 @@ pub mod test { use super::*; // This is basically testing the portable implementation against itself, - // but we do it anyway for completeness. Other implementations will test - // themselves against portable. We also have several tests against the - // reference implementation in test.rs. + // but it also checks that compress_in_place and compress_xof are + // consistent. And there are tests against the reference implementation and + // against hardcoded test vectors elsewhere. #[test] fn test_compress() { - crate::test::test_compress_fn(compress); + crate::test::test_compress_fn(compress_in_place, compress_xof); } // Ditto. 
diff --git a/src/sse41.rs b/src/sse41.rs
index a95d8a2..e45b22e 100644
--- a/src/sse41.rs
+++ b/src/sse41.rs
@@ -3,7 +3,9 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+    offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
 use arrayref::{array_mut_ref, array_ref, mut_array_refs};
 
 pub const DEGREE: usize = 4;
@@ -122,16 +124,16 @@ unsafe fn undiagonalize(row1: &mut __m128i, row3: &mut __m128i, row4: &mut __m12
     *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(2, 1, 0, 3));
 }
 
-#[target_feature(enable = "sse4.1")]
-pub unsafe fn compress(
-    cv: &[u8; 32],
+#[inline(always)]
+unsafe fn compress_pre(
+    cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
     offset: u64,
     flags: u8,
-) -> [u8; 64] {
-    let row1 = &mut loadu(&cv[0]);
-    let row2 = &mut loadu(&cv[16]);
+) -> [__m128i; 4] {
+    let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8);
+    let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8);
     let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
     let row4 = &mut set4(
         offset_low(offset),
@@ -303,12 +305,37 @@ pub unsafe fn compress(
     g2(row1, row2, row3, row4, buf);
     undiagonalize(row1, row3, row4);
 
-    *row1 = xor(*row1, *row3);
-    *row2 = xor(*row2, *row4);
-    *row3 = xor(*row3, loadu(&cv[0]));
-    *row4 = xor(*row4, loadu(&cv[16]));
+    [*row1, *row2, *row3, *row4]
+}
+
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_in_place(
+    cv: &mut CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    offset: u64,
+    flags: u8,
+) {
+    let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags);
+    storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8);
+    storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8);
+}
 
-    core::mem::transmute([*row1, *row2, *row3, *row4]) // x86 is little-endian
+#[target_feature(enable = "sse4.1")]
+pub unsafe fn compress_xof(
+    cv: &CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    offset: u64,
+    flags: u8,
+) -> [u8; 64] {
+    let [mut row1, mut row2, mut row3, mut row4] =
+        compress_pre(cv, block, block_len, offset, flags);
+    row1 = xor(row1, row3);
+    row2 = xor(row2, row4);
+    row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8));
+    row4 = xor(row4, loadu(cv.as_ptr().add(4) as *const u8));
+    core::mem::transmute([row1, row2, row3, row4])
 }
 
 #[inline(always)]
@@ -500,7 +527,7 @@ unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, _
 pub unsafe fn hash4(
     inputs: &[*const u8; DEGREE],
     blocks: usize,
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -508,16 +535,15 @@ pub unsafe fn hash4(
     flags_end: u8,
     out: &mut [u8; DEGREE * OUT_LEN],
 ) {
-    let key_words: [u32; 8] = core::mem::transmute(*key); // x86 is little-endian
     let mut h_vecs = [
-        set1(key_words[0]),
-        set1(key_words[1]),
-        set1(key_words[2]),
-        set1(key_words[3]),
-        set1(key_words[4]),
-        set1(key_words[5]),
-        set1(key_words[6]),
-        set1(key_words[7]),
+        set1(key[0]),
+        set1(key[1]),
+        set1(key[2]),
+        set1(key[3]),
+        set1(key[4]),
+        set1(key[5]),
+        set1(key[6]),
+        set1(key[7]),
     ];
     let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
     let mut block_flags = flags | flags_start;
@@ -589,12 +615,12 @@ pub unsafe fn hash4(
 #[target_feature(enable = "sse4.1")]
 unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
     input: &A,
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
-    out: &mut [u8; OUT_LEN],
+    out: &mut CVBytes,
 ) {
     debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
     let mut cv = *key;
@@ -604,24 +630,23 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
         if slice.len() == BLOCK_LEN {
             block_flags |= flags_end;
         }
-        let out = compress(
-            &cv,
+        compress_in_place(
+            &mut cv,
             array_ref!(slice, 0, BLOCK_LEN),
             BLOCK_LEN as u8,
             offset,
             block_flags,
         );
-        cv = *array_ref!(out, 0, 32);
         block_flags = flags;
         slice = &slice[BLOCK_LEN..];
     }
-    *out = cv;
+    *out = core::mem::transmute(cv); // x86 is little-endian
 }
 
 #[target_feature(enable = "sse4.1")]
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     mut inputs: &[&A],
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     mut offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -705,7 +730,7 @@ mod test {
         if !crate::platform::sse41_detected() {
             return;
         }
-        crate::test::test_compress_fn(compress);
+        crate::test::test_compress_fn(compress_in_place, compress_xof);
     }
 
     #[test]
diff --git a/src/test.rs b/src/test.rs
index 9d710d2..e5ee4e2 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,4 +1,4 @@
-use crate::{OffsetDeltas, BLOCK_LEN, CHUNK_LEN, KEY_LEN, OUT_LEN};
+use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
 use arrayref::array_ref;
 use arrayvec::ArrayVec;
 use core::usize;
@@ -31,7 +31,11 @@ pub const TEST_CASES: &[usize] = &[
 
 pub const TEST_CASES_MAX: usize = 31 * CHUNK_LEN;
 
-pub const TEST_KEY: [u8; crate::KEY_LEN] = *b"whats the Elvish word for friend";
+// There's a test to make sure these two are equal below.
+pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: CVWords = [
+    1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
+];
 
 // Paint the input with a repeating byte pattern. We use a cycle length of 251,
 // because that's the largest prime number less than 256. This makes it
@@ -43,8 +47,11 @@ pub fn paint_test_input(buf: &mut [u8]) {
     }
 }
 
-type CompressFn = unsafe fn(
-    cv: &[u8; 32],
+type CompressInPlaceFn =
+    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8);
+
+type CompressXofFn = unsafe fn(
+    cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
     offset: u64,
@@ -52,8 +59,8 @@ type CompressFn = unsafe fn(
 ) -> [u8; 64];
 
 // A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_fn: CompressFn) {
-    let initial_state = *b"IV for compression tests <('.')>";
+pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
+    let initial_state = TEST_KEY_WORDS;
     let block_len: u8 = 61;
     let mut block = [0; BLOCK_LEN];
     paint_test_input(&mut block[..block_len as usize]);
@@ -62,16 +69,21 @@ pub fn test_compress_fn(compress_fn: CompressFn) {
     let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
 
     let portable_out =
-        crate::portable::compress(&initial_state, &block, block_len, offset as u64, flags);
+        crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags);
 
-    let test_out = unsafe { compress_fn(&initial_state, &block, block_len, offset as u64, flags) };
+    let mut test_state = initial_state;
+    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) };
+    let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
+    let test_xof =
+        unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) };
 
-    assert_eq!(&portable_out[..], &test_out[..]);
+    assert_eq!(&portable_out[..32], &test_state_bytes[..]);
+    assert_eq!(&portable_out[..], &test_xof[..]);
 }
 
 type HashManyFn<A> = unsafe fn(
     inputs: &[&A],
-    key: &[u8; KEY_LEN],
+    key: &CVWords,
     offset: u64,
     offset_deltas: &OffsetDeltas,
     flags: u8,
@@ -100,7 +112,7 @@ pub fn test_hash_many_fn(
     let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
     crate::portable::hash_many(
         &chunks,
-        &TEST_KEY,
+        &TEST_KEY_WORDS,
         offset,
         crate::CHUNK_OFFSET_DELTAS,
         crate::DERIVE_KEY,
@@ -113,7 +125,7 @@ pub fn test_hash_many_fn(
     unsafe {
         hash_many_chunks_fn(
             &chunks[..],
-            &TEST_KEY,
+            &TEST_KEY_WORDS,
             offset,
             crate::CHUNK_OFFSET_DELTAS,
             crate::DERIVE_KEY,
@@ -139,7 +151,7 @@ pub fn test_hash_many_fn(
     let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
     crate::portable::hash_many(
         &parents,
-        &TEST_KEY,
+        &TEST_KEY_WORDS,
         0,
         crate::PARENT_OFFSET_DELTAS,
         crate::DERIVE_KEY | crate::PARENT,
@@ -152,7 +164,7 @@ pub fn test_hash_many_fn(
     unsafe {
         hash_many_parents_fn(
             &parents[..],
-            &TEST_KEY,
+            &TEST_KEY_WORDS,
             0,
             crate::PARENT_OFFSET_DELTAS,
             crate::DERIVE_KEY | crate::PARENT,
@@ -172,6 +184,14 @@ pub fn test_hash_many_fn(
 }
 
 #[test]
+fn test_key_bytes_equal_key_words() {
+    assert_eq!(
+        TEST_KEY_WORDS,
+        crate::platform::words_from_le_bytes_32(&TEST_KEY),
+    );
+}
+
+#[test]
 fn test_reference_impl_size() {
     // Because the Rust compiler optimizes struct layout, it's possible that
     // some future version of the compiler will produce a different size. If
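One detail worth noting in the test changes: `TEST_KEY_WORDS` is simply the little-endian `u32` view of the `TEST_KEY` bytes, which is what the new `test_key_bytes_equal_key_words` test pins down via `words_from_le_bytes_32`. A quick sanity check of the first word (an illustration, not code from this commit):

```rust
// The first four key bytes are b"what" = [0x77, 0x68, 0x61, 0x74].
// Read as a little-endian u32, that's 0x74616877 == 1952540791,
// which is TEST_KEY_WORDS[0].
let first_word = u32::from_le_bytes(*b"what");
assert_eq!(first_word, 1952540791);
```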
