diff options
| author | Jack O'Connor <[email protected]> | 2022-03-08 22:41:27 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-08 22:41:27 -0500 |
| commit | 5d4655920151a941b997bc0c59a86d493f7e3548 (patch) | |
| tree | 46cc530c0769e6abacf0ede32a12c128e3c2c5fe | |
| parent | 4e8ae445c4233dc2aa3c4814af12028fb3619f3d (diff) | |
split the left and right child CVs for blake3_avx512_parents_16
There's no reason to force the caller to allocate them together.
| -rw-r--r-- | benches/bench.rs | 19 | ||||
| -rw-r--r-- | src/kernel.rs | 57 |
2 files changed, 44 insertions, 32 deletions
```diff
diff --git a/benches/bench.rs b/benches/bench.rs
index 2fccba1..4aa62aa 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -252,17 +252,26 @@ fn bench_many_parents_neon(b: &mut Bencher) {
 
 #[bench]
 fn bench_many_parents_kernel(b: &mut Bencher) {
-    let alignment = std::mem::align_of::<blake3::kernel::Words16>();
+    use blake3::kernel::Words16;
+    let size = 16 * std::mem::size_of::<Words16>();
+    let alignment = std::mem::align_of::<Words16>();
     assert_eq!(alignment, 64);
-    let mut input = RandomInput::new_aligned(b, 16 * 16 * 4, 64);
+    let mut input = RandomInput::new_aligned(b, size, alignment);
     for _ in 0..100 {
         assert_eq!(0, (input.get().as_ptr() as usize) % alignment);
     }
     let mut output = [blake3::kernel::Words16([0; 16]); 8];
     b.iter(|| unsafe {
-        let rand_bytes = input.get();
-        let rand_vectors = &*(rand_bytes.as_ptr() as *const [blake3::kernel::Words16; 16]);
-        blake3::kernel::parents16(&rand_vectors, &[0; 8], 0, &mut output);
+        let rand_ptr = input.get().as_ptr();
+        let rand_left_children = &*(rand_ptr as *const [Words16; 8]);
+        let rand_right_children = &*(rand_ptr.add(size / 2) as *const [Words16; 8]);
+        blake3::kernel::parents16(
+            &rand_left_children,
+            &rand_right_children,
+            &[0; 8],
+            0,
+            &mut output,
+        );
     });
 }
 
diff --git a/src/kernel.rs b/src/kernel.rs
index 4abdb76..3f5498d 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1049,9 +1049,9 @@ global_asm!(
     // blake3_avx512_parents_16
     //
     // zmm0-zmm31: [clobbered]
-    // rdi: pointer to 16 transposed state vectors, 8 left and 8 right, 64-byte aligned
-    // rsi: pointer to the 32-byte key, unaligned
-    // rdx: [unused]
+    // rdi: pointer to the left child CVs, 8 transposed state vectors, 64-byte aligned
+    // rsi: pointer to the right child CVs, 8 transposed state vectors, 64-byte aligned
+    // rdx: pointer to the 32-byte key, unaligned
     // ecx: [clobbered]
     // r8d: flags (other than PARENT)
     // r9: out pointer to 8x64 bytes, 64-byte aligned
@@ -1088,53 +1088,53 @@ global_asm!(
     "vmovdqa32 zmm1, zmmword ptr [rip + BLAKE3_AVX512_ODD_INDEXES]",
     "vmovdqa32 zmm16, zmmword ptr [rdi + 0 * 64]",
     "vmovdqa32 zmm24, zmm16",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 8 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 0 * 64]",
     "vpermt2d zmm16, zmm0, zmm2",
     "vpermt2d zmm24, zmm1, zmm2",
     "vmovdqa32 zmm17, zmmword ptr [rdi + 1 * 64]",
     "vmovdqa32 zmm25, zmm17",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 9 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 1 * 64]",
     "vpermt2d zmm17, zmm0, zmm2",
     "vpermt2d zmm25, zmm1, zmm2",
     "vmovdqa32 zmm18, zmmword ptr [rdi + 2 * 64]",
     "vmovdqa32 zmm26, zmm18",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 10 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 2 * 64]",
     "vpermt2d zmm18, zmm0, zmm2",
     "vpermt2d zmm26, zmm1, zmm2",
     "vmovdqa32 zmm19, zmmword ptr [rdi + 3 * 64]",
     "vmovdqa32 zmm27, zmm19",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 11 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 3 * 64]",
     "vpermt2d zmm19, zmm0, zmm2",
     "vpermt2d zmm27, zmm1, zmm2",
     "vmovdqa32 zmm20, zmmword ptr [rdi + 4 * 64]",
     "vmovdqa32 zmm28, zmm20",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 12 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 4 * 64]",
     "vpermt2d zmm20, zmm0, zmm2",
     "vpermt2d zmm28, zmm1, zmm2",
     "vmovdqa32 zmm21, zmmword ptr [rdi + 5 * 64]",
     "vmovdqa32 zmm29, zmm21",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 13 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 5 * 64]",
     "vpermt2d zmm21, zmm0, zmm2",
     "vpermt2d zmm29, zmm1, zmm2",
     "vmovdqa32 zmm22, zmmword ptr [rdi + 6 * 64]",
     "vmovdqa32 zmm30, zmm22",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 14 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 6 * 64]",
     "vpermt2d zmm22, zmm0, zmm2",
     "vpermt2d zmm30, zmm1, zmm2",
     "vmovdqa32 zmm23, zmmword ptr [rdi + 7 * 64]",
     "vmovdqa32 zmm31, zmm23",
-    "vmovdqa32 zmm2, zmmword ptr [rdi + 15 * 64]",
+    "vmovdqa32 zmm2, zmmword ptr [rsi + 7 * 64]",
     "vpermt2d zmm23, zmm0, zmm2",
     "vpermt2d zmm31, zmm1, zmm2",
     // Broadcast the key into zmm0-zmm7.
-    "vpbroadcastd zmm0, dword ptr [rsi + 0 * 4]",
-    "vpbroadcastd zmm1, dword ptr [rsi + 1 * 4]",
-    "vpbroadcastd zmm2, dword ptr [rsi + 2 * 4]",
-    "vpbroadcastd zmm3, dword ptr [rsi + 3 * 4]",
-    "vpbroadcastd zmm4, dword ptr [rsi + 4 * 4]",
-    "vpbroadcastd zmm5, dword ptr [rsi + 5 * 4]",
-    "vpbroadcastd zmm6, dword ptr [rsi + 6 * 4]",
-    "vpbroadcastd zmm7, dword ptr [rsi + 7 * 4]",
+    "vpbroadcastd zmm0, dword ptr [rdx + 0 * 4]",
+    "vpbroadcastd zmm1, dword ptr [rdx + 1 * 4]",
+    "vpbroadcastd zmm2, dword ptr [rdx + 2 * 4]",
+    "vpbroadcastd zmm3, dword ptr [rdx + 3 * 4]",
+    "vpbroadcastd zmm4, dword ptr [rdx + 4 * 4]",
+    "vpbroadcastd zmm5, dword ptr [rdx + 5 * 4]",
+    "vpbroadcastd zmm6, dword ptr [rdx + 6 * 4]",
+    "vpbroadcastd zmm7, dword ptr [rdx + 7 * 4]",
     // Initialize the third and fourth rows of the state.
     "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants
     "vmovdqa32 zmm9, zmmword ptr [BLAKE3_IV1_16 + rip]",
@@ -1208,16 +1208,17 @@ pub unsafe fn chunks16(
 }
 
 pub unsafe fn parents16(
-    child_cvs: &[Words16; 16],
+    left_child_cvs: &[Words16; 8],
+    right_child_cvs: &[Words16; 8],
     key: &[u32; 8],
     flags: u32,
     out_ptr: *mut [Words16; 8],
 ) {
     asm!(
         "call blake3_avx512_parents_16",
-        inout("rdi") child_cvs => _,
-        inout("rsi") key => _,
-        out("rdx") _,
+        inout("rdi") left_child_cvs => _,
+        inout("rsi") right_child_cvs => _,
+        inout("rdx") key => _,
         out("ecx") _,
         inout("r8d") flags => _,
         inout("r9") out_ptr => _,
@@ -1298,7 +1299,8 @@ fn test_parents16() {
     }
 
     // 8 transposed left child CVs and 8 transposed right child CVs
-    let mut transposed_child_cvs = [Words16([0; 16]); 16];
+    let mut transposed_left_child_cvs = [Words16([0; 16]); 8];
+    let mut transposed_right_child_cvs = [Words16([0; 16]); 8];
     for child_i in 0..16 {
         for word_i in 0..8 {
             let word = u32::from_le_bytes(
@@ -1306,7 +1308,7 @@ fn test_parents16() {
                     .try_into()
                     .unwrap(),
             );
-            transposed_child_cvs[word_i].0[child_i] = word;
+            transposed_left_child_cvs[word_i].0[child_i] = word;
         }
     }
     for child_i in 16..32 {
@@ -1316,13 +1318,14 @@ fn test_parents16() {
                     .try_into()
                     .unwrap(),
             );
-            transposed_child_cvs[8 + word_i].0[child_i - 16] = word;
+            transposed_right_child_cvs[word_i].0[child_i - 16] = word;
         }
     }
     let mut found_out = [Words16([0; 16]); 8];
     unsafe {
         parents16(
-            &transposed_child_cvs,
+            &transposed_left_child_cvs,
+            &transposed_right_child_cvs,
             crate::IV,
             flags as u32,
             &mut found_out,
```
