author     Jack O'Connor <[email protected]>   2019-12-10 14:20:09 -0500
committer  Jack O'Connor <[email protected]>   2019-12-11 18:05:26 -0500
commit     52ea6487f88a0e5cbc2f784f3095539afe6c91e4 (patch)
tree       181508c1840c2961e530e982c4525029d79e5685 /src/portable.rs
parent     d68882da0d897c93a271a7c0f6d6b9b13d13aa16 (diff)
switch to representing CVs as words for the compression function
The portable implementation was getting slowed down by converting back and forth between words and bytes. I made the corresponding change on the C side first (https://github.com/veorq/BLAKE3-c/commit/12a37be8b50922a358c016ba07f46816a3da4a31), and as part of this commit I'm re-vendoring the C code. I'm also exposing a small FFI interface to C so that blake3_neon.c can link against portable.rs rather than blake3_portable.c (see c_neon.rs).
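
For context on the word/byte conversions mentioned above: the diff below routes them through crate::platform::words_from_le_bytes_64 and le_bytes_from_words_64, whose bodies live in the crate's platform module and are not shown on this page. The sketch below assumes they are plain little-endian packing and unpacking helpers; the loop bodies and the round_trip test are illustrative, not the vendored code.

fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    // Sketch (assumption): interpret each 4-byte little-endian group as one u32 word.
    let mut words = [0u32; 16];
    for (word, chunk) in words.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]);
    }
    words
}

fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    // Sketch (assumption): serialize each word back to little-endian bytes in order.
    let mut bytes = [0u8; 64];
    for (chunk, word) in bytes.chunks_exact_mut(4).zip(words.iter()) {
        chunk.copy_from_slice(&word.to_le_bytes());
    }
    bytes
}

#[test]
fn round_trip() {
    // Illustrative check that the two helpers invert each other.
    let mut block = [0u8; 64];
    for (i, b) in block.iter_mut().enumerate() {
        *b = i as u8;
    }
    assert_eq!(le_bytes_from_words_64(&words_from_le_bytes_64(&block)), block);
}

With the chaining value held as words (CVWords) end to end, compress_in_place can update it directly, and hash1 only materializes the byte form once at the end via le_bytes_from_words_32, instead of round-tripping through bytes after every block.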
Diffstat (limited to 'src/portable.rs')
-rw-r--r--  src/portable.rs | 133
1 file changed, 65 insertions(+), 68 deletions(-)
diff --git a/src/portable.rs b/src/portable.rs
index b07c46a..fa0e17d 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -1,4 +1,6 @@
-use crate::{offset_high, offset_low, OffsetDeltas, BLOCK_LEN, IV, KEY_LEN, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+ offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
use arrayref::{array_mut_ref, array_ref};
#[inline(always)]
@@ -31,41 +33,25 @@ fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
-pub fn compress(
- cv: &[u8; 32],
+#[inline(always)]
+fn compress_pre(
+ cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
offset: u64,
flags: u8,
-) -> [u8; 64] {
- let block_words = [
- u32::from_le_bytes(*array_ref!(block, 0 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 1 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 2 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 3 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 4 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 5 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 6 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 7 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 8 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 9 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 10 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 11 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 12 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 13 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 14 * 4, 4)),
- u32::from_le_bytes(*array_ref!(block, 15 * 4, 4)),
- ];
+) -> [u32; 16] {
+ let block_words = crate::platform::words_from_le_bytes_64(block);
let mut state = [
- u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4)),
- u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4)),
+ cv[0],
+ cv[1],
+ cv[2],
+ cv[3],
+ cv[4],
+ cv[5],
+ cv[6],
+ cv[7],
IV[0],
IV[1],
IV[2],
@@ -84,6 +70,36 @@ pub fn compress(
round(&mut state, &block_words, 5);
round(&mut state, &block_words, 6);
+ state
+}
+
+pub fn compress_in_place(
+ cv: &mut CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) {
+ let state = compress_pre(cv, block, block_len, offset, flags);
+
+ cv[0] = state[0] ^ state[8];
+ cv[1] = state[1] ^ state[9];
+ cv[2] = state[2] ^ state[10];
+ cv[3] = state[3] ^ state[11];
+ cv[4] = state[4] ^ state[12];
+ cv[5] = state[5] ^ state[13];
+ cv[6] = state[6] ^ state[14];
+ cv[7] = state[7] ^ state[15];
+}
+
+pub fn compress_xof(
+ cv: &CVWords,
+ block: &[u8; BLOCK_LEN],
+ block_len: u8,
+ offset: u64,
+ flags: u8,
+) -> [u8; 64] {
+ let mut state = compress_pre(cv, block, block_len, offset, flags);
state[0] ^= state[8];
state[1] ^= state[9];
state[2] ^= state[10];
@@ -92,43 +108,25 @@ pub fn compress(
state[5] ^= state[13];
state[6] ^= state[14];
state[7] ^= state[15];
- state[8] ^= u32::from_le_bytes(*array_ref!(cv, 0 * 4, 4));
- state[9] ^= u32::from_le_bytes(*array_ref!(cv, 1 * 4, 4));
- state[10] ^= u32::from_le_bytes(*array_ref!(cv, 2 * 4, 4));
- state[11] ^= u32::from_le_bytes(*array_ref!(cv, 3 * 4, 4));
- state[12] ^= u32::from_le_bytes(*array_ref!(cv, 4 * 4, 4));
- state[13] ^= u32::from_le_bytes(*array_ref!(cv, 5 * 4, 4));
- state[14] ^= u32::from_le_bytes(*array_ref!(cv, 6 * 4, 4));
- state[15] ^= u32::from_le_bytes(*array_ref!(cv, 7 * 4, 4));
-
- let mut out = [0; 64];
- out[0 * 4..][..4].copy_from_slice(&state[0].to_le_bytes());
- out[1 * 4..][..4].copy_from_slice(&state[1].to_le_bytes());
- out[2 * 4..][..4].copy_from_slice(&state[2].to_le_bytes());
- out[3 * 4..][..4].copy_from_slice(&state[3].to_le_bytes());
- out[4 * 4..][..4].copy_from_slice(&state[4].to_le_bytes());
- out[5 * 4..][..4].copy_from_slice(&state[5].to_le_bytes());
- out[6 * 4..][..4].copy_from_slice(&state[6].to_le_bytes());
- out[7 * 4..][..4].copy_from_slice(&state[7].to_le_bytes());
- out[8 * 4..][..4].copy_from_slice(&state[8].to_le_bytes());
- out[9 * 4..][..4].copy_from_slice(&state[9].to_le_bytes());
- out[10 * 4..][..4].copy_from_slice(&state[10].to_le_bytes());
- out[11 * 4..][..4].copy_from_slice(&state[11].to_le_bytes());
- out[12 * 4..][..4].copy_from_slice(&state[12].to_le_bytes());
- out[13 * 4..][..4].copy_from_slice(&state[13].to_le_bytes());
- out[14 * 4..][..4].copy_from_slice(&state[14].to_le_bytes());
- out[15 * 4..][..4].copy_from_slice(&state[15].to_le_bytes());
- out
+ state[8] ^= cv[0];
+ state[9] ^= cv[1];
+ state[10] ^= cv[2];
+ state[11] ^= cv[3];
+ state[12] ^= cv[4];
+ state[13] ^= cv[5];
+ state[14] ^= cv[6];
+ state[15] ^= cv[7];
+ crate::platform::le_bytes_from_words_64(&state)
}
pub fn hash1<A: arrayvec::Array<Item = u8>>(
input: &A,
- key: &[u8; KEY_LEN],
+ key: &CVWords,
offset: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
- out: &mut [u8; OUT_LEN],
+ out: &mut CVBytes,
) {
debug_assert_eq!(A::CAPACITY % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
@@ -138,23 +136,22 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
- let output = compress(
- &cv,
+ compress_in_place(
+ &mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
offset,
block_flags,
);
- cv = *array_ref!(output, 0, 32);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
- *out = cv;
+ *out = crate::platform::le_bytes_from_words_32(&cv);
}
pub fn hash_many<A: arrayvec::Array<Item = u8>>(
inputs: &[&A],
- key: &[u8; KEY_LEN],
+ key: &CVWords,
mut offset: u64,
offset_deltas: &OffsetDeltas,
flags: u8,
@@ -182,12 +179,12 @@ pub mod test {
use super::*;
// This is basically testing the portable implementation against itself,
- // but we do it anyway for completeness. Other implementations will test
- // themselves against portable. We also have several tests against the
- // reference implementation in test.rs.
+ // but it also checks that compress_in_place and compress_xof are
+ // consistent. And there are tests against the reference implementation and
+ // against hardcoded test vectors elsewhere.
#[test]
fn test_compress() {
- crate::test::test_compress_fn(compress);
+ crate::test::test_compress_fn(compress_in_place, compress_xof);
}
// Ditto.