diff options
| author | Jack O'Connor <[email protected]> | 2022-12-17 12:37:24 -0800 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-12-17 13:33:13 -0800 |
| commit | 0c80427419382e696f91ffcae3ae3157f2bfe768 (patch) | |
| tree | f89c5cf115ff71ee9ad9ca568eacbaf618351889 | |
| parent | c9c63d54dcd96d7ff11e17ece98b03fe9db8aa8b (diff) | |
kernel2::parents_16
| -rw-r--r-- | benches/bench.rs | 35 | ||||
| -rw-r--r-- | src/kernel.rs | 2 | ||||
| -rw-r--r-- | src/kernel2.rs | 121 |
3 files changed, 149 insertions, 9 deletions
diff --git a/benches/bench.rs b/benches/bench.rs index 77003ba..6471a0f 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,4 +1,5 @@ #![feature(test)] +#![feature(stdsimd)] extern crate test; @@ -8,6 +9,7 @@ use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::OUT_LEN; use rand::prelude::*; +use std::mem; use test::Bencher; const KIB: usize = 1024; @@ -293,8 +295,8 @@ fn bench_many_parents_kernel(b: &mut Bencher) { return; } use blake3::kernel::Words16; - let size = 16 * std::mem::size_of::<Words16>(); - let alignment = std::mem::align_of::<Words16>(); + let size = 16 * mem::size_of::<Words16>(); + let alignment = mem::align_of::<Words16>(); assert_eq!(alignment, 64); let mut input = RandomInput::new_aligned(b, size, alignment); for _ in 0..100 { @@ -676,7 +678,7 @@ fn bench_chunks16_kernel2(b: &mut Bencher) { return; } let mut input = RandomInput::new(b, 16 * CHUNK_LEN); - let key_words = [0; 8]; + let key_words = [42; 8]; let counter = 0; let flags = 0; b.iter(|| unsafe { @@ -686,6 +688,33 @@ fn bench_chunks16_kernel2(b: &mut Bencher) { } #[bench] +fn bench_parents16_kernel2(b: &mut Bencher) { + if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") { + return; + } + b.bytes = 16 * BLOCK_LEN as u64; + let mut random_parent_bytes = [0; 16 * 64]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut random_parent_bytes); + let left_parent_bytes: [u8; 16 * 32] = random_parent_bytes[..16 * 32].try_into().unwrap(); + let right_parent_bytes: [u8; 16 * 32] = random_parent_bytes[16 * 32..].try_into().unwrap(); + let left_parent_vectors: [core::arch::x86_64::__m512i; 8] = + unsafe { mem::transmute(left_parent_bytes) }; + let right_parent_vectors: [core::arch::x86_64::__m512i; 8] = + unsafe { mem::transmute(right_parent_bytes) }; + let key_words = [42; 8]; + let flags = 0; + b.iter(|| unsafe { + blake3::kernel2::parents_16( + &left_parent_vectors, + &right_parent_vectors, + &key_words, + flags, + ); + }); +} + +#[bench] fn bench_xof_kernel2(b: &mut Bencher) { if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") { return; diff --git a/src/kernel.rs b/src/kernel.rs index 0d7a5b3..b9a1dcb 100644 --- a/src/kernel.rs +++ b/src/kernel.rs @@ -2448,7 +2448,7 @@ global_asm!( "BLAKE3_AVX512_ODD_INDEXES:", ".long 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31", "blake3_avx512_parents_16:", - // The first 8 out of 16 input message vectors, which are the transposed CVs of the first 8 + // The first 8 out of 16 input message vectors, which are the transposed CVs of the first 16 // children, come in looking like this: // // a0, b0, c0, d0, e0, f0, g0, h0, i0, j0, k0, l0, m0, n0, o0, p0 diff --git a/src/kernel2.rs b/src/kernel2.rs index 907d2c7..8527f99 100644 --- a/src/kernel2.rs +++ b/src/kernel2.rs @@ -1115,12 +1115,123 @@ fn test_chunks_16() { #[target_feature(enable = "avx512f,avx512vl")] pub unsafe fn parents_16( - _left_children: &[__m512i; 8], - _right_children: &[__m512i; 8], - _key: &[u32; 8], - _flags: u32, + left_children: &[__m512i; 8], + right_children: &[__m512i; 8], + key: &[u32; 8], + flags: u32, ) -> [__m512i; 8] { - todo!(); + let even_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + let odd_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + let mut state_regs = [ + _mm512_set1_epi32(key[0] as i32), + _mm512_set1_epi32(key[1] as i32), + _mm512_set1_epi32(key[2] as i32), + _mm512_set1_epi32(key[3] as i32), + _mm512_set1_epi32(key[4] as i32), + _mm512_set1_epi32(key[5] as i32), + _mm512_set1_epi32(key[6] as i32), + _mm512_set1_epi32(key[7] as i32), + ]; + asm!( + "call blake3_avx512_kernel2_16", + inout("zmm0") state_regs[0], + inout("zmm1") state_regs[1], + inout("zmm2") state_regs[2], + inout("zmm3") state_regs[3], + inout("zmm4") state_regs[4], + inout("zmm5") state_regs[5], + inout("zmm6") state_regs[6], + inout("zmm7") state_regs[7], + in("zmm8") _mm512_set1_epi32(IV[0] as i32), + in("zmm9") _mm512_set1_epi32(IV[1] as i32), + in("zmm10") _mm512_set1_epi32(IV[2] as i32), + in("zmm11") _mm512_set1_epi32(IV[3] as i32), + in("zmm12") _mm512_set1_epi32(0), + in("zmm13") _mm512_set1_epi32(0), + in("zmm14") _mm512_set1_epi32(BLOCK_LEN as i32), + in("zmm15") _mm512_set1_epi32((flags | crate::PARENT as u32) as i32), + in("zmm16") _mm512_permutex2var_epi32(left_children[0], even_indexes, right_children[0]), + in("zmm17") _mm512_permutex2var_epi32(left_children[1], even_indexes, right_children[1]), + in("zmm18") _mm512_permutex2var_epi32(left_children[2], even_indexes, right_children[2]), + in("zmm19") _mm512_permutex2var_epi32(left_children[3], even_indexes, right_children[3]), + in("zmm20") _mm512_permutex2var_epi32(left_children[4], even_indexes, right_children[4]), + in("zmm21") _mm512_permutex2var_epi32(left_children[5], even_indexes, right_children[5]), + in("zmm22") _mm512_permutex2var_epi32(left_children[6], even_indexes, right_children[6]), + in("zmm23") _mm512_permutex2var_epi32(left_children[7], even_indexes, right_children[7]), + in("zmm24") _mm512_permutex2var_epi32(left_children[0], odd_indexes, right_children[0]), + in("zmm25") _mm512_permutex2var_epi32(left_children[1], odd_indexes, right_children[1]), + in("zmm26") _mm512_permutex2var_epi32(left_children[2], odd_indexes, right_children[2]), + in("zmm27") _mm512_permutex2var_epi32(left_children[3], odd_indexes, right_children[3]), + in("zmm28") _mm512_permutex2var_epi32(left_children[4], odd_indexes, right_children[4]), + in("zmm29") _mm512_permutex2var_epi32(left_children[5], odd_indexes, right_children[5]), + in("zmm30") _mm512_permutex2var_epi32(left_children[6], odd_indexes, right_children[6]), + in("zmm31") _mm512_permutex2var_epi32(left_children[7], odd_indexes, right_children[7]), + ); + state_regs +} + +#[test] +fn test_parents_16() { + if !crate::platform::avx512_detected() { + return; + } + + // the (untransposed) bytes of 32 concatenated child CVs + let mut child_bytes = [0; 32 * 32]; + crate::test::paint_test_input(&mut child_bytes); + // the same bytes, reinterpreted as words + let child_words: [u32; 32 * 8] = core::array::from_fn(|word| { + u32::from_le_bytes(child_bytes[4 * word..][..4].try_into().unwrap()) + }); + // manually transpose the words into vector layout + let mut left_child_vecs = [[0u32; 16]; 8]; + for cv in 0..16 { + for word in 0..8 { + left_child_vecs[word][cv] = child_words[8 * cv + word]; + } + } + let mut right_child_vecs = [[0u32; 16]; 8]; + for cv in 0..16 { + for word in 0..8 { + right_child_vecs[word][cv] = child_words[8 * (16 + cv) + word]; + } + } + + let left_children: [__m512i; 8] = unsafe { mem::transmute(left_child_vecs) }; + let right_children: [__m512i; 8] = unsafe { mem::transmute(right_child_vecs) }; + let key = [42, 43, 44, 45, 46, 47, 48, 49]; + let outputs = unsafe { + parents_16( + &left_children, + &right_children, + &key, + crate::KEYED_HASH as u32, + ) + }; + let output_words: [[u32; 16]; 8] = unsafe { mem::transmute(outputs) }; + let mut untransposed_output_words = [[0; 8]; 16]; + for vec in 0..8 { + for word in 0..16 { + untransposed_output_words[word][vec] = output_words[vec][word]; + } + } + let untransposed_output_bytes: [u8; 16 * 32] = + unsafe { mem::transmute(untransposed_output_words) }; + + let child_blocks: [&[u8; 64]; 16] = + core::array::from_fn(|block| child_bytes[64 * block..][..64].try_into().unwrap()); + let mut expected = [0u8; 16 * 32]; + crate::portable::hash_many( + &child_blocks, + &key, + 0, + crate::IncrementCounter::No, + crate::PARENT | crate::KEYED_HASH, + 0, + 0, + &mut expected, + ); + assert_eq!(expected, untransposed_output_bytes); } #[target_feature(enable = "avx512f,avx512vl")] |
