| author | Jack O'Connor | 2023-08-19 11:51:41 +0800 |
|---|---|---|
| committer | Jack O'Connor | 2023-08-19 16:48:03 +0800 |
| commit | 2a97ae553d35c1c1b9d5ab024b50d40e47d113ec | |
| tree | d2452f06abf3828ea1a19a4f0686ee9b99b190ee | |
| parent | 1903d90afc52a939c7027159d2b9b31df0eb11b3 | |
blake3_guts_riscv64gcv_compress
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | rust/guts/src/riscv64gcv.S | 153 |
| -rw-r--r-- | rust/guts/src/riscv64gcv.rs | 10 |
2 files changed, 162 insertions, 1 deletions
diff --git a/rust/guts/src/riscv64gcv.S b/rust/guts/src/riscv64gcv.S
index 324b9ad..9625cd8 100644
--- a/rust/guts/src/riscv64gcv.S
+++ b/rust/guts/src/riscv64gcv.S
@@ -16,6 +16,159 @@
 .section .text
 
+.p2align 2
+IV_VEC:
+    .word IV0, IV1, IV2, IV3
+ROR1:
+    .word 3, 0, 1, 2
+ROR2:
+    .word 2, 3, 0, 1
+ROR3:
+    .word 1, 2, 3, 0
+
+# The bottom half of the load permutation is tweaked to account for the fact that
+# we hold the second row fixed during diagonalization.
+MSG_LOAD:
+    .short 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+
+# The message permutation as given in the BLAKE3 spec would be the correct
+# permutation to use if the load order above was 0, 1, 2, 3... However, since
+# we're using a tricky load order, we need to adjust the permutation accordingly.
+# The following Python snippet reproduces the permutation we're using here:
+#
+# load_order = [0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13]
+# original_permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]
+# retargeted_permutation = [load_order.index(x) for x in original_permutation]
+# shuffled_permutation = [retargeted_permutation[i] for i in load_order]
+# print(shuffled_permutation)
+MSG_PERMUTE:
+    .short 1, 5, 7, 2, 3, 10, 0, 15, 12, 4, 11, 13, 9, 14, 6, 8
+
+// a0: block (zero-padded to 64 bytes)
+// a1: block_len
+// a2: cv_bytes
+// a3: counter
+// a4: flags
+// a5: out_ptr
+.global blake3_guts_riscv64gcv_compress
+blake3_guts_riscv64gcv_compress:
+    // Load the message load and message permutation indexes.
+    vsetivli zero, 16, e16, m2, ta, ma
+    la t0, MSG_LOAD
+    vle16.v v8, (t0)
+    la t0, MSG_PERMUTE
+    vle16.v v10, (t0)
+    // Load the CV into v0-v1.
+    vsetivli zero, 16, e8, m1, ta, ma
+    vle8.v v0, (a2)
+    addi a2, a2, 16
+    vle8.v v1, (a2)
+    // Set LMUL=4 and load the message block temporarily into scratch
+    // space. Apply the MSG_LOAD permutation, and then move the permuted
+    // message words into v4-v7.
+    // TODO: Do this with less register movement?
+    li t0, 64
+    vsetvli zero, t0, e8, m4, ta, ma
+    vle8.v v20, (a0)
+    vsetivli zero, 16, e32, m4, ta, ma
+    vrgatherei16.vv v16, v20, v8
+    vsetivli zero, 4, e32, m4, ta, ma
+    vslidedown.vi v20, v16, 4
+    vslidedown.vi v24, v16, 8
+    vslidedown.vi v28, v16, 12
+    vsetivli zero, 4, e32, m1, ta, ma
+    vmv.v.v v4, v16
+    vmv.v.v v5, v20
+    vmv.v.v v6, v24
+    vmv.v.v v7, v28
+    // Load the diagonalization gather indexes.
+    la t0, ROR1
+    vle32.v v12, (t0)
+    la t0, ROR2
+    vle32.v v13, (t0)
+    la t0, ROR3
+    vle32.v v14, (t0)
+    // Load the IV words.
+    la t0, IV_VEC
+    vle32.v v2, (t0)
+    // Load the counter, block_len, and flags.
+    vsetivli zero, 4, e32, m1, ta, ma
+    vslide1down.vx v3, v3, a3
+    srli a3, a3, 32
+    vslide1down.vx v3, v3, a3
+    vslide1down.vx v3, v3, a1
+    vslide1down.vx v3, v3, a4
+    li t0, 7 // round counter
+blake3_guts_riscv64gcv_compress_round_loop:
+    vadd.vv v0, v0, v4
+    vadd.vv v0, v0, v1
+    vxor.vv v3, v3, v0
+    vror.vi v3, v3, 16
+    vadd.vv v2, v2, v3
+    vxor.vv v1, v1, v2
+    vror.vi v1, v1, 12
+    vadd.vv v0, v0, v5
+    vadd.vv v0, v0, v1
+    vxor.vv v3, v3, v0
+    vror.vi v3, v3, 8
+    vadd.vv v2, v2, v3
+    vxor.vv v1, v1, v2
+    vror.vi v1, v1, 7
+    // Gathers can't overlap a source register, so use v20/v22/v23 in place
+    // of v0/v2/v3 for this section.
+    vrgather.vv v20, v0, v12
+    vrgather.vv v23, v3, v13
+    vrgather.vv v22, v2, v14
+    vadd.vv v20, v20, v6
+    vadd.vv v20, v20, v1
+    vxor.vv v23, v23, v20
+    vror.vi v23, v23, 16
+    vadd.vv v22, v22, v23
+    vxor.vv v1, v1, v22
+    vror.vi v1, v1, 12
+    vadd.vv v20, v20, v7
+    vadd.vv v20, v20, v1
+    vxor.vv v23, v23, v20
+    vror.vi v23, v23, 8
+    vadd.vv v22, v22, v23
+    vxor.vv v1, v1, v22
+    vror.vi v1, v1, 7
+    vrgather.vv v0, v20, v14
+    vrgather.vv v3, v23, v13
+    vrgather.vv v2, v22, v12
+    addi t0, t0, -1
+    beqz t0, blake3_guts_riscv64gcv_compress_end
+    // Shuffle message words.
+    // TODO: Find a way to do this without so much movement?
+    vmv.v.v v16, v4
+    vmv.v.v v20, v5
+    vmv.v.v v24, v6
+    vmv.v.v v28, v7
+    vsetivli zero, 16, e32, m4, ta, ma
+    vslideup.vi v16, v20, 4
+    vslideup.vi v16, v24, 8
+    vslideup.vi v16, v28, 12
+    vrgatherei16.vv v28, v16, v10
+    vsetivli zero, 4, e32, m4, ta, ma
+    vslidedown.vi v16, v28, 4
+    vslidedown.vi v20, v28, 8
+    vslidedown.vi v24, v28, 12
+    vsetivli zero, 4, e32, m1, ta, ma
+    vmv.v.v v4, v28
+    vmv.v.v v5, v16
+    vmv.v.v v6, v20
+    vmv.v.v v7, v24
+    j blake3_guts_riscv64gcv_compress_round_loop
+blake3_guts_riscv64gcv_compress_end:
+    vxor.vv v0, v0, v2
+    vxor.vv v1, v1, v3
+    vsetivli zero, 16, e8, m1, ta, ma
+    vse8.v v0, (a5)
+    addi a5, a5, 16
+    vse8.v v1, (a5)
+    ret
+
 .global blake3_guts_riscv64gcv_degree
 blake3_guts_riscv64gcv_degree:
     csrr t0, vlenb
diff --git a/rust/guts/src/riscv64gcv.rs b/rust/guts/src/riscv64gcv.rs
index 02b7413..4bd1858 100644
--- a/rust/guts/src/riscv64gcv.rs
+++ b/rust/guts/src/riscv64gcv.rs
@@ -11,6 +11,14 @@ pub(crate) const MAX_SIMD_DEGREE: usize = 16;
 
 extern "C" {
     fn blake3_guts_riscv64gcv_degree() -> usize;
+    fn blake3_guts_riscv64gcv_compress(
+        block: *const BlockBytes,
+        block_len: u32,
+        cv: *const CVBytes,
+        counter: u64,
+        flags: u32,
+        out: *mut CVBytes,
+    );
     fn blake3_guts_riscv64gcv_hash_chunks(
         input: *const u8,
         input_len: usize,
@@ -49,7 +57,7 @@ extern "C" {
 pub fn implementation() -> Implementation {
     Implementation::new(
         blake3_guts_riscv64gcv_degree,
-        crate::portable::compress,
+        blake3_guts_riscv64gcv_compress,
         blake3_guts_riscv64gcv_hash_chunks,
         blake3_guts_riscv64gcv_hash_parents,
         blake3_guts_riscv64gcv_xof,
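The MSG_PERMUTE table above is derived by the Python snippet quoted in the assembly comments. The same index arithmetic can be reproduced in a standalone Rust sketch (illustrative only, not part of this commit), which recomputes the retargeted, shuffled permutation and checks it against the constant baked into riscv64gcv.S:

```rust
// Illustrative sketch: recompute MSG_PERMUTE from the tricky load order and
// the standard BLAKE3 message permutation, mirroring the Python snippet in
// the riscv64gcv.S comments.
fn main() {
    let load_order: [usize; 16] = [0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13];
    let original_permutation: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];

    // retargeted[i] = position of original_permutation[i] within load_order.
    let retargeted: Vec<usize> = original_permutation
        .iter()
        .map(|x| load_order.iter().position(|y| y == x).unwrap())
        .collect();

    // shuffled[i] = retargeted[load_order[i]], i.e. the permutation re-expressed
    // in the shuffled register layout used by MSG_LOAD.
    let shuffled: Vec<usize> = load_order.iter().map(|&i| retargeted[i]).collect();

    // Matches the MSG_PERMUTE table in the assembly above.
    assert_eq!(shuffled, [1, 5, 7, 2, 3, 10, 0, 15, 12, 4, 11, 13, 9, 14, 6, 8]);
    println!("{:?}", shuffled);
}
```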

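Since the commit swaps crate::portable::compress for blake3_guts_riscv64gcv_compress in Implementation::new, a natural smoke test is to run both routines on the same inputs and compare the outputs. The following is a hypothetical sketch, not part of the commit: it assumes BlockBytes and CVBytes are the crate's 64- and 32-byte array aliases (matching the pointer types in the extern declaration) and that crate::portable::compress shares that signature; the module name, test name, and input values are made up for illustration.

```rust
#[cfg(test)]
mod riscv64gcv_compress_sketch {
    use super::*;

    // Hypothetical sanity check: the vector-assembly compress should produce
    // the same chaining value as the portable fallback it replaces.
    #[test]
    fn matches_portable_compress() {
        // Assumed aliases: BlockBytes = [u8; 64], CVBytes = [u8; 32].
        let mut block: BlockBytes = [0; 64];
        for (i, byte) in block.iter_mut().enumerate() {
            *byte = i as u8;
        }
        let cv: CVBytes = [0x42; 32];
        let (block_len, counter, flags) = (64u32, 1u64, 0u32);

        let mut expected: CVBytes = [0; 32];
        let mut found: CVBytes = [0; 32];
        unsafe {
            crate::portable::compress(&block, block_len, &cv, counter, flags, &mut expected);
            blake3_guts_riscv64gcv_compress(&block, block_len, &cv, counter, flags, &mut found);
        }
        assert_eq!(expected, found);
    }
}
```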