author     Jack O'Connor <[email protected]>    2022-11-21 13:23:20 -0800
committer  Jack O'Connor <[email protected]>    2022-11-21 13:23:20 -0800
commit     1ef99db193cdcac9d36e2e6f4737cde5ac0c4a8c (patch)
tree       fcd92903ad73ea1d7fa19d0ae26cffc41fc94f36
parent     0ab6dbcc4740ef2d5c2b9a69cb68d5d09ddaf290 (diff)
WIP i don't remember what this is
-rw-r--r--  benches/bench.rs   5
-rw-r--r--  src/kernel2.rs    68
2 files changed, 55 insertions(+), 18 deletions(-)
diff --git a/benches/bench.rs b/benches/bench.rs
index 32e6319..df88f29 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -376,7 +376,10 @@ fn bench_atonce_1024_kib(b: &mut Bencher) {
fn bench_incremental(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
- b.iter(|| blake3::Hasher::new().update(input.get()).finalize());
+ let mut state = blake3::Hasher::new();
+ b.iter(|| {
+ state.update(input.get());
+ });
}
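
The hunk above switches the incremental benchmark from constructing a fresh Hasher on every iteration to reusing one long-lived state, so only the update path is measured. A minimal standalone sketch of the two styles, assuming the nightly test crate and blake3 as a dependency (RandomInput from the real harness is replaced by a plain Vec here):

#![feature(test)]
extern crate test;
use test::Bencher;

const LEN: usize = 65536;

#[bench]
fn bench_oneshot(b: &mut Bencher) {
    // Old style: hasher construction, update, and finalize are all inside the timed closure.
    let input = vec![0xabu8; LEN];
    b.bytes = LEN as u64;
    b.iter(|| blake3::Hasher::new().update(&input).finalize());
}

#[bench]
fn bench_incremental_reuse(b: &mut Bencher) {
    // New style: one long-lived hasher; only the incremental update is timed.
    let input = vec![0xabu8; LEN];
    b.bytes = LEN as u64;
    let mut state = blake3::Hasher::new();
    b.iter(|| {
        state.update(&input);
    });
}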
#[bench]
diff --git a/src/kernel2.rs b/src/kernel2.rs
index e68453d..830699e 100644
--- a/src/kernel2.rs
+++ b/src/kernel2.rs
@@ -848,6 +848,7 @@ unsafe fn load_transposed_16(input: *const u8) -> [__m512i; 16] {
// Because operations that cross 128-bit lanes are relatively expensive, we split each 512-bit
// load into four 128-bit loads. This results in vectors like:
// a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
+ #[inline(always)]
unsafe fn load_4_lanes(input: *const u8) -> __m512i {
let lane0 = _mm_loadu_epi32(input.add(0 * CHUNK_LEN) as *const i32);
let lane1 = _mm_loadu_epi32(input.add(4 * CHUNK_LEN) as *const i32);
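
The comment above describes splitting each 512-bit message load into four 128-bit loads so that no data crosses a 128-bit lane. The rest of load_4_lanes is outside this hunk; a hedged sketch of how the four lanes might be combined (the _mm512_inserti32x4 sequence below is an assumption for illustration, not the commit's code):

use core::arch::x86_64::*;

const CHUNK_LEN: usize = 1024; // BLAKE3 chunk length in bytes

// Safety: `input` must point to at least 12 * CHUNK_LEN + 16 readable bytes.
#[target_feature(enable = "avx512f,avx512vl")]
unsafe fn load_4_lanes_sketch(input: *const u8) -> __m512i {
    // Four unaligned 128-bit loads, one per future 128-bit lane of the result.
    let lane0 = _mm_loadu_epi32(input.add(0 * CHUNK_LEN) as *const i32);
    let lane1 = _mm_loadu_epi32(input.add(4 * CHUNK_LEN) as *const i32);
    let lane2 = _mm_loadu_epi32(input.add(8 * CHUNK_LEN) as *const i32);
    let lane3 = _mm_loadu_epi32(input.add(12 * CHUNK_LEN) as *const i32);
    // Widen lane0 to 512 bits and insert the others; nothing moves across a 128-bit lane.
    let mut vec = _mm512_castsi128_si512(lane0);
    vec = _mm512_inserti32x4::<1>(vec, lane1);
    vec = _mm512_inserti32x4::<2>(vec, lane2);
    vec = _mm512_inserti32x4::<3>(vec, lane3);
    vec
}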
@@ -950,20 +951,20 @@ fn test_load_transpose_16() {
#[target_feature(enable = "avx512f,avx512vl")]
pub unsafe fn chunks_16(
- message: &[u8; 16 * CHUNK_LEN],
- key: &[u32; 8],
- counter: u64,
- flags: u32,
+ _message: &[u8; 16 * CHUNK_LEN],
+ _key: &[u32; 8],
+ _counter: u64,
+ _flags: u32,
) -> [__m512i; 8] {
todo!();
}
#[target_feature(enable = "avx512f,avx512vl")]
pub unsafe fn parents_16(
- left_children: &[__m512i; 8],
- right_children: &[__m512i; 8],
- key: &[u32; 8],
- flags: u32,
+ _left_children: &[__m512i; 8],
+ _right_children: &[__m512i; 8],
+ _key: &[u32; 8],
+ _flags: u32,
) -> [__m512i; 8] {
todo!();
}
@@ -1053,6 +1054,8 @@ unsafe fn xof_inner_16(
state[i + 8] = _mm512_xor_si512(state[i + 8], _mm512_set1_epi32(cv[i] as i32));
}
+ dbg!(std::mem::transmute::<_, [u8; 64]>(state[0]));
+
// Partially untranspose the state vectors. We'll use the same trick here as with message
// loading, where we avoid doing any relatively expensive cross-128-bit-lane operations, and
// instead we delay reordering 128-bit lanes until the store step.
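
The dbg! line added above inspects a 512-bit register by transmuting it to raw bytes. The same debugging trick as a small reusable helper, for illustration only:

fn dump_vec(label: &str, v: core::arch::x86_64::__m512i) {
    // __m512i and [u8; 64] are the same size, and any bit pattern is a valid byte array.
    let bytes: [u8; 64] = unsafe { core::mem::transmute(v) };
    eprintln!("{label}: {bytes:02x?}");
}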
@@ -1076,7 +1079,7 @@ unsafe fn xof_inner_16(
let abefijmn_ef = _mm512_unpacklo_epi32(state[14], state[15]);
let cdghklop_ef = _mm512_unpackhi_epi32(state[14], state[15]);
- // Finally, interleave 64-bit words. This gives us our intermediate goal, which is vectors like:
+ // Interleave 64-bit words. This gives us our intermediate goal, which is vectors like:
// a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
[
_mm512_unpacklo_epi64(abefijmn_01, abefijmn_23), // aeim_0123
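
The untranspose above leans on the fact that unpacklo/unpackhi interleave independently within each 128-bit lane, which is why the relatively expensive cross-lane reordering can be deferred to the store step. A hedged standalone check of that per-lane behavior, written in the style of this file's other tests (assumes the file's existing core::arch::x86_64 imports; not part of the commit):

#[target_feature(enable = "avx512f")]
unsafe fn unpack_within_lanes_demo() {
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = _mm512_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    let lo: [i32; 16] = core::mem::transmute(_mm512_unpacklo_epi32(a, b));
    let hi: [i32; 16] = core::mem::transmute(_mm512_unpackhi_epi32(a, b));
    // Each 128-bit lane is interleaved on its own; no word crosses a lane boundary.
    assert_eq!(&lo[..4], &[0, 16, 1, 17]);
    assert_eq!(&hi[..4], &[2, 18, 3, 19]);
    assert_eq!(&lo[4..8], &[4, 20, 5, 21]);
    assert_eq!(&hi[12..], &[14, 30, 15, 31]);
}

#[test]
fn test_unpack_within_lanes_demo() {
    if !crate::platform::avx512_detected() {
        return;
    }
    unsafe { unpack_within_lanes_demo() };
}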
@@ -1107,15 +1110,46 @@ pub unsafe fn xof_16(
flags: u32,
output: &mut [u8; BLOCK_LEN * 16],
) {
- let vecs = xof_inner_16(block, cv, counter, block_len, flags);
- for i in 0..vecs.len() {
- let dest = output.as_mut_ptr().add(64 * i);
- _mm512_storeu_si512(dest as *mut i32, vecs[i]);
+ unsafe fn write_4_lanes<const LANE: i32>(vecs: &[__m512i; 16], first_vec: usize, out: *mut u8) {
+ _mm_storeu_epi32(
+ out.add(0 * 16) as *mut i32,
+ _mm512_extracti32x4_epi32::<LANE>(vecs[first_vec + 0]),
+ );
+ _mm_storeu_epi32(
+ out.add(1 * 16) as *mut i32,
+ _mm512_extracti32x4_epi32::<LANE>(vecs[first_vec + 4]),
+ );
+ _mm_storeu_epi32(
+ out.add(2 * 16) as *mut i32,
+ _mm512_extracti32x4_epi32::<LANE>(vecs[first_vec + 8]),
+ );
+ _mm_storeu_epi32(
+ out.add(3 * 16) as *mut i32,
+ _mm512_extracti32x4_epi32::<LANE>(vecs[first_vec + 12]),
+ );
}
+
+ let vecs = xof_inner_16(block, cv, counter, block_len, flags);
+ write_4_lanes::<0>(&vecs, 0, output.as_mut_ptr().add(0 * 64));
+ write_4_lanes::<0>(&vecs, 1, output.as_mut_ptr().add(1 * 64));
+ write_4_lanes::<0>(&vecs, 2, output.as_mut_ptr().add(2 * 64));
+ write_4_lanes::<0>(&vecs, 3, output.as_mut_ptr().add(3 * 64));
+ write_4_lanes::<1>(&vecs, 0, output.as_mut_ptr().add(4 * 64));
+ write_4_lanes::<1>(&vecs, 1, output.as_mut_ptr().add(5 * 64));
+ write_4_lanes::<1>(&vecs, 2, output.as_mut_ptr().add(6 * 64));
+ write_4_lanes::<1>(&vecs, 3, output.as_mut_ptr().add(7 * 64));
+ write_4_lanes::<2>(&vecs, 0, output.as_mut_ptr().add(8 * 64));
+ write_4_lanes::<2>(&vecs, 1, output.as_mut_ptr().add(9 * 64));
+ write_4_lanes::<2>(&vecs, 2, output.as_mut_ptr().add(10 * 64));
+ write_4_lanes::<2>(&vecs, 3, output.as_mut_ptr().add(11 * 64));
+ write_4_lanes::<3>(&vecs, 0, output.as_mut_ptr().add(12 * 64));
+ write_4_lanes::<3>(&vecs, 1, output.as_mut_ptr().add(13 * 64));
+ write_4_lanes::<3>(&vecs, 2, output.as_mut_ptr().add(14 * 64));
+ write_4_lanes::<3>(&vecs, 3, output.as_mut_ptr().add(15 * 64));
}
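
The rewritten store above emits the output as sixteen 64-byte blocks, each assembled from four 128-bit extracts. A scalar model of the index mapping those write_4_lanes calls implement (illustration only, not part of the commit):

// Output block `i` takes 128-bit lane `i / 4` from transposed vectors
// `i % 4`, `i % 4 + 4`, `i % 4 + 8`, and `i % 4 + 12`, in that order.
fn output_block_sources(i: usize) -> (usize, [usize; 4]) {
    let lane = i / 4;
    let first_vec = i % 4;
    (lane, [first_vec, first_vec + 4, first_vec + 8, first_vec + 12])
}

#[test]
fn test_output_block_sources() {
    assert_eq!(output_block_sources(0), (0, [0, 4, 8, 12]));
    assert_eq!(output_block_sources(5), (1, [1, 5, 9, 13]));
    assert_eq!(output_block_sources(15), (3, [3, 7, 11, 15]));
}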
#[test]
-fn test_xor_16() {
+fn test_xof_16() {
if !crate::platform::avx512_detected() {
return;
}
@@ -1128,18 +1162,18 @@ fn test_xor_16() {
unsafe {
xof_16(&block, IV, counter, block_len, flags, &mut output);
}
+ dbg!(output);
- let mut incrementing_counter = counter;
for i in 0..16 {
+ dbg!(i);
let expected = crate::portable::compress_xof(
IV,
&block,
block_len as u8,
- incrementing_counter,
+ counter + i as u64,
flags as u8,
);
assert_eq!(expected, output[BLOCK_LEN * i..][..BLOCK_LEN]);
- incrementing_counter += 1;
}
}