about | summary | refs | log | tree | commit | diff
path: root/src/kernel.rs
diff options
context:
space:
mode:
author Jack O'Connor <[email protected]> 2022-04-01 15:43:04 -0400
committer Jack O'Connor <[email protected]> 2022-04-09 13:31:19 -0700
commit e17743e8fdf2845be6dc85ad339bf45feeefc564 (patch)
tree 705b7204f9522a666476ad38f49393c31e11a2de /src/kernel.rs
parent 35ad4ededdbf259c507c49b2e7ac529b43b61671 (diff)
kernel_3d_16 and xof functions (kernel)
Diffstat (limited to 'src/kernel.rs')
-rw-r--r-- src/kernel.rs 37
1 files changed, 31 insertions, 6 deletions
diff --git a/src/kernel.rs b/src/kernel.rs
index cdcb25a..0d7a5b3 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -73,6 +73,14 @@ extern "C" {
flags: u32,
out: *mut [u8; 64 * 4],
);
+ pub fn blake3_avx512_xof_stream_16(
+ cv: &[u32; 8],
+ block: &[u8; 64],
+ counter: u64,
+ block_len: u32,
+ flags: u32,
+ out: *mut [u8; 64 * 16],
+ );
pub fn blake3_sse2_xof_xor_1(
cv: &[u32; 8],
block: &[u8; 64],
@@ -121,6 +129,14 @@ extern "C" {
flags: u32,
out: &mut [u8; 64 * 4],
);
+ pub fn blake3_avx512_xof_xor_16(
+ cv: &[u32; 8],
+ block: &[u8; 64],
+ counter: u64,
+ block_len: u32,
+ flags: u32,
+ out: &mut [u8; 64 * 16],
+ );
}
pub type CompressionFn =
@@ -311,6 +327,15 @@ mod test {
}
test_xof_functions(blake3_avx512_xof_stream_4, blake3_avx512_xof_xor_4);
}
+
+ #[test]
+ #[cfg(target_arch = "x86_64")]
+ fn test_avx512_xof_16() {
+ if !is_x86_feature_detected!("avx512f") || !is_x86_feature_detected!("avx512vl") {
+ return;
+ }
+ test_xof_functions(blake3_avx512_xof_stream_16, blake3_avx512_xof_xor_16);
+ }
}
global_asm!(
@@ -2618,7 +2643,7 @@ global_asm!(
"ret",
//
// --------------------------------------------------------------------------------------------
- // blake3_avx512_xof_stream_16
+ // blake3_avx512_xof_stream_16_ORIGINAL
//
// zmm0-zmm31: [clobbered]
// rdi: pointer to the 16-word message block, 4-byte aligned
@@ -2631,7 +2656,7 @@ global_asm!(
// This routine performs the root compression for 16 consecutive output blocks and writes 1024
// bytes of output to the out pointer.
// --------------------------------------------------------------------------------------------
- "blake3_avx512_xof_stream_16:",
+ "blake3_avx512_xof_stream_16_ORIGINAL:",
// Broadcast the input CV into zmm0-zmm7, the first two rows of the state.
"vpbroadcastd zmm0, dword ptr [rsi + 0 * 4]",
"vpbroadcastd zmm1, dword ptr [rsi + 1 * 4]",
@@ -2801,7 +2826,7 @@ global_asm!(
"ret",
//
// --------------------------------------------------------------------------------------------
- // blake3_avx512_xof_xor_16
+ // blake3_avx512_xof_xor_16_ORIGINAL
//
// zmm0-zmm31: [clobbered]
// rdi: pointer to the 16-word message block, 4-byte aligned
@@ -2814,7 +2839,7 @@ global_asm!(
// This routine performs the root compression for 16 consecutive output blocks and xor's 1024
// bytes of output into the inout pointer.
// --------------------------------------------------------------------------------------------
- "blake3_avx512_xof_xor_16:",
+ "blake3_avx512_xof_xor_16_ORIGINAL:",
// Broadcast the input CV into zmm0-zmm7, the first two rows of the state.
"vpbroadcastd zmm0, dword ptr [rsi + 0 * 4]",
"vpbroadcastd zmm1, dword ptr [rsi + 1 * 4]",
@@ -3143,7 +3168,7 @@ pub unsafe fn xof_stream16(
counter_vectors[1].0[i] = ((counter + i as u64) >> 32) as u32;
}
asm!(
- "call blake3_avx512_xof_stream_16",
+ "call blake3_avx512_xof_stream_16_ORIGINAL",
inout("rdi") message_words => _,
inout("rsi") cv_words => _,
inout("rdx") &counter_vectors => _,
@@ -3176,7 +3201,7 @@ pub unsafe fn xof_xor16(
counter_vectors[1].0[i] = ((counter + i as u64) >> 32) as u32;
}
asm!(
- "call blake3_avx512_xof_xor_16",
+ "call blake3_avx512_xof_xor_16_ORIGINAL",
inout("rdi") message_words => _,
inout("rsi") cv_words => _,
inout("rdx") &counter_vectors => _,