diff options
| author | Jack O'Connor <[email protected]> | 2022-03-09 11:21:13 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-09 11:21:13 -0500 |
| commit | 506ae0b0fe255c00c69c3ca6a6388e0a20eebe40 (patch) | |
| tree | d013217aed9834a7d12dea8f7bf24949a72ae8ff | |
| parent | deac82543627cf79010b58cf1472c8261456dcbf (diff) | |
move third row initialization into blake3_avx512_kernel_16
| -rw-r--r-- | src/kernel.rs | 58 |
1 files changed, 30 insertions, 28 deletions
diff --git a/src/kernel.rs b/src/kernel.rs index 7542565..ae5712e 100644 --- a/src/kernel.rs +++ b/src/kernel.rs @@ -5,13 +5,36 @@ global_asm!( // -------------------------------------------------------------------------------------------- // blake3_avx512_kernel_16 // - // zmm0-zmm15: state vectors + // zmm0-zmm7: transposed input CV (which may be the key or the IV) + // zmm12: transposed lower order counter words + // zmm13: transposed higher order counter words + // zmm14: transposed block sizes (all 64) + // zmm15: transposed flag words // zmm16-zmm31: transposed message vectors // - // This routine executes all 7 rounds of compression and performs the XOR of the upper half of - // the state into the lower half (but not the feed-forward). The result is left in zmm0-zmm7. + // This routine overwrites zmm8-zmm11 (the third row of the state) with IV bytes, executes all + // 7 rounds of compression, and performs the XOR of the upper half of the state into the lower + // half (but not the feed-forward). The result is left in zmm0-zmm7. // -------------------------------------------------------------------------------------------- + ".p2align 6", + "BLAKE3_IV0_16:", + ".long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667", + ".long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667", + "BLAKE3_IV1_16:", + ".long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85", + ".long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85", + "BLAKE3_IV2_16:", + ".long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372", + ".long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372", + "BLAKE3_IV3_16:", + ".long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A", + ".long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A", "blake3_avx512_kernel_16:", + // load IV constants into the third row + "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", + "vmovdqa32 zmm9, zmmword ptr [BLAKE3_IV1_16 + rip]", + "vmovdqa32 zmm10, zmmword ptr [BLAKE3_IV2_16 + rip]", + "vmovdqa32 zmm11, zmmword ptr [BLAKE3_IV3_16 + rip]", // round 1 "vpaddd zmm0, zmm0, zmm16", "vpaddd zmm1, zmm1, zmm18", @@ -951,12 +974,8 @@ global_asm!( "vpunpckhqdq zmm29, zmm30, zmm13", "vpunpcklqdq zmm30, zmm31, zmm14", "vpunpckhqdq zmm31, zmm31, zmm14", - // Initialize the third and fourth rows of the state, part of which we just used as scratch - // space during transposition. - "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants - "vmovdqa32 zmm9, zmmword ptr [BLAKE3_IV1_16 + rip]", - "vmovdqa32 zmm10, zmmword ptr [BLAKE3_IV2_16 + rip]", - "vmovdqa32 zmm11, zmmword ptr [BLAKE3_IV3_16 + rip]", + // Initialize fourth row of the state, part of which we just used as scratch space during + // transposition. "vmovdqa32 zmm12, zmmword ptr [rdx + 64 * 0]", // counter low "vmovdqa32 zmm13, zmmword ptr [rdx + 64 * 1]", // counter high "vpbroadcastd zmm14, ecx", // block length (always 64) @@ -1135,11 +1154,7 @@ global_asm!( "vpbroadcastd zmm5, dword ptr [rdx + 5 * 4]", "vpbroadcastd zmm6, dword ptr [rdx + 6 * 4]", "vpbroadcastd zmm7, dword ptr [rdx + 7 * 4]", - // Initialize the third and fourth rows of the state. - "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants - "vmovdqa32 zmm9, zmmword ptr [BLAKE3_IV1_16 + rip]", - "vmovdqa32 zmm10, zmmword ptr [BLAKE3_IV2_16 + rip]", - "vmovdqa32 zmm11, zmmword ptr [BLAKE3_IV3_16 + rip]", + // Initialize the fourth row of the state. "xor ecx, ecx", // zero "vpbroadcastd zmm12, ecx", // counter low (always 0) "vpbroadcastd zmm13, ecx", // counter high (always 0) @@ -1185,11 +1200,7 @@ global_asm!( "vpbroadcastd zmm5, dword ptr [rsi + 5 * 4]", "vpbroadcastd zmm6, dword ptr [rsi + 6 * 4]", "vpbroadcastd zmm7, dword ptr [rsi + 7 * 4]", - // Initialize zmm8-zmm15, the third and fourth rows of the state. - "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants - "vmovdqa32 zmm9, zmmword ptr [BLAKE3_IV1_16 + rip]", - "vmovdqa32 zmm10, zmmword ptr [BLAKE3_IV2_16 + rip]", - "vmovdqa32 zmm11, zmmword ptr [BLAKE3_IV3_16 + rip]", + // Initialize zmm12-zmm15, fourth row of the state. "vmovdqa32 zmm12, zmmword ptr [rdx + 64 * 0]", // counter low "vmovdqa32 zmm13, zmmword ptr [rdx + 64 * 1]", // counter high "mov ecx, 64", @@ -1355,15 +1366,6 @@ global_asm!( #[derive(Copy, Clone, Debug)] pub struct Words16(pub [u32; 16]); -#[no_mangle] -static BLAKE3_IV0_16: Words16 = Words16([crate::IV[0]; 16]); -#[no_mangle] -static BLAKE3_IV1_16: Words16 = Words16([crate::IV[1]; 16]); -#[no_mangle] -static BLAKE3_IV2_16: Words16 = Words16([crate::IV[2]; 16]); -#[no_mangle] -static BLAKE3_IV3_16: Words16 = Words16([crate::IV[3]; 16]); - pub unsafe fn chunks16( message: &[u8; 16 * CHUNK_LEN], key: &[u32; 8], |
