diff options
| author | Jack O'Connor <[email protected]> | 2022-03-08 13:00:13 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-08 22:23:09 -0500 |
| commit | bcbbcc8d2c61e2653d3434b049e3c7d857984bd3 (patch) | |
| tree | dc35fcbbe1dfd1f106d96e5708013e7b8bb7e1c5 | |
| parent | 78b8e87f91a1c9253a8de2f088d09f5d0c15d990 (diff) | |
now using only 3 scratch zmm registers
| -rw-r--r-- | src/kernel.rs | 193 |
1 file changed, 96 insertions, 97 deletions
diff --git a/src/kernel.rs b/src/kernel.rs index 09860e4..6d06186 100644 --- a/src/kernel.rs +++ b/src/kernel.rs @@ -828,103 +828,102 @@ global_asm!( // and invokes blake3_avx512_kernel_16. // -------------------------------------------------------------------------------------------- "blake3_avx512_blocks_16:", - "vmovdqu32 xmm8, xmmword ptr [rdi + 0 * 16 + 0 * 1024]", - "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 4 * 1024], 1", - "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 8 * 1024], 2", - "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 12 * 1024], 3", - "vmovdqu32 xmm9, xmmword ptr [rdi + 0 * 16 + 1 * 1024]", - "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1", - "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2", - "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3", - "vpunpckldq zmm24, zmm8, zmm9", - "vpunpckhdq zmm25, zmm8, zmm9", - "vmovdqu32 xmm10, xmmword ptr [rdi + 0 * 16 + 2 * 1024]", - "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1", - "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2", - "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 14 * 1024], 3", - "vmovdqu32 xmm11, xmmword ptr [rdi + 0 * 16 + 3 * 1024]", - "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1", - "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2", - "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3", - "vpunpckldq zmm26, zmm10, zmm11", - "vpunpckhdq zmm27, zmm10, zmm11", - "vmovdqu32 xmm12, xmmword ptr [rdi + 1 * 16 + 0 * 1024]", - "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1", - "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2", - "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 12 * 1024], 3", - "vmovdqu32 xmm13, xmmword ptr [rdi + 1 * 16 + 1 * 1024]", - "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1", - "vinserti32x4 zmm13, 
zmm13, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2", - "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3", - "vpunpckldq zmm28, zmm12, zmm13", - "vpunpckhdq zmm29, zmm12, zmm13", - "vmovdqu32 xmm14, xmmword ptr [rdi + 1 * 16 + 2 * 1024]", - "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1", - "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2", - "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 14 * 1024], 3", - "vmovdqu32 xmm15, xmmword ptr [rdi + 1 * 16 + 3 * 1024]", - "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1", - "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2", - "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3", - "vpunpckldq zmm30, zmm14, zmm15", - "vpunpckhdq zmm31, zmm14, zmm15", - "vmovdqu32 xmm16, xmmword ptr [rdi + 2 * 16 + 0 * 1024]", - "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1", - "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2", - "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 12 * 1024], 3", - "vmovdqu32 xmm17, xmmword ptr [rdi + 2 * 16 + 1 * 1024]", - "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1", - "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2", - "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3", - "vpunpckldq zmm8, zmm16, zmm17", - "vpunpckhdq zmm9, zmm16, zmm17", - "vmovdqu32 xmm18, xmmword ptr [rdi + 2 * 16 + 2 * 1024]", - "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1", - "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2", - "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 14 * 1024], 3", - "vmovdqu32 xmm19, xmmword ptr [rdi + 2 * 16 + 3 * 1024]", - "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1", - "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2", - "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 
* 16 + 15 * 1024], 3", - "vpunpckldq zmm10, zmm18, zmm19", - "vpunpckhdq zmm11, zmm18, zmm19", - "vmovdqu32 xmm20, xmmword ptr [rdi + 3 * 16 + 0 * 1024]", - "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1", - "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2", - "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 12 * 1024], 3", - "vmovdqu32 xmm21, xmmword ptr [rdi + 3 * 16 + 1 * 1024]", - "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1", - "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2", - "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3", - "vpunpckldq zmm12, zmm20, zmm21", - "vpunpckhdq zmm13, zmm20, zmm21", - "vmovdqu32 xmm22, xmmword ptr [rdi + 3 * 16 + 2 * 1024]", - "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1", - "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2", - "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 14 * 1024], 3", - "vmovdqu32 xmm23, xmmword ptr [rdi + 3 * 16 + 3 * 1024]", - "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1", - "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2", - "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3", - "vpunpckldq zmm14, zmm22, zmm23", - "vpunpckhdq zmm15, zmm22, zmm23", - // interleave 64-bit words - "vpunpcklqdq zmm16, zmm24, zmm26", - "vpunpckhqdq zmm17, zmm24, zmm26", - "vpunpcklqdq zmm18, zmm25, zmm27", - "vpunpckhqdq zmm19, zmm25, zmm27", - "vpunpcklqdq zmm20, zmm28, zmm30", - "vpunpckhqdq zmm21, zmm28, zmm30", - "vpunpcklqdq zmm22, zmm29, zmm31", - "vpunpckhqdq zmm23, zmm29, zmm31", - "vpunpcklqdq zmm24, zmm8, zmm10", - "vpunpckhqdq zmm25, zmm8, zmm10", - "vpunpcklqdq zmm26, zmm9, zmm11", - "vpunpckhqdq zmm27, zmm9, zmm11", - "vpunpcklqdq zmm28, zmm12, zmm14", - "vpunpckhqdq zmm29, zmm12, zmm14", - "vpunpcklqdq zmm30, zmm13, zmm15", - "vpunpckhqdq zmm31, zmm13, zmm15", + "vmovdqu32 xmm19, 
xmmword ptr [rdi + 0 * 16 + 0 * 1024]", + "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 4 * 1024], 1", + "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 8 * 1024], 2", + "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 12 * 1024], 3", + "vmovdqu32 xmm20, xmmword ptr [rdi + 0 * 16 + 1 * 1024]", + "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1", + "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2", + "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3", + "vpunpckldq zmm18, zmm19, zmm20", + "vpunpckhdq zmm19, zmm19, zmm20", + "vmovdqu32 xmm21, xmmword ptr [rdi + 0 * 16 + 2 * 1024]", + "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1", + "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2", + "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 14 * 1024], 3", + "vmovdqu32 xmm22, xmmword ptr [rdi + 0 * 16 + 3 * 1024]", + "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1", + "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2", + "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3", + "vpunpckldq zmm20, zmm21, zmm22", + "vpunpckhdq zmm21, zmm21, zmm22", + "vpunpcklqdq zmm16, zmm18, zmm20", + "vpunpckhqdq zmm17, zmm18, zmm20", + "vpunpcklqdq zmm18, zmm19, zmm21", + "vpunpckhqdq zmm19, zmm19, zmm21", + "vmovdqu32 xmm23, xmmword ptr [rdi + 1 * 16 + 0 * 1024]", + "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1", + "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2", + "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 12 * 1024], 3", + "vmovdqu32 xmm24, xmmword ptr [rdi + 1 * 16 + 1 * 1024]", + "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1", + "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2", + "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3", + "vpunpckldq zmm22, zmm23, zmm24", + 
"vpunpckhdq zmm23, zmm23, zmm24", + "vmovdqu32 xmm25, xmmword ptr [rdi + 1 * 16 + 2 * 1024]", + "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1", + "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2", + "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 14 * 1024], 3", + "vmovdqu32 xmm26, xmmword ptr [rdi + 1 * 16 + 3 * 1024]", + "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1", + "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2", + "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3", + "vpunpckldq zmm24, zmm25, zmm26", + "vpunpckhdq zmm25, zmm25, zmm26", + "vpunpcklqdq zmm20, zmm22, zmm24", + "vpunpckhqdq zmm21, zmm22, zmm24", + "vpunpcklqdq zmm22, zmm23, zmm25", + "vpunpckhqdq zmm23, zmm23, zmm25", + "vmovdqu32 xmm27, xmmword ptr [rdi + 2 * 16 + 0 * 1024]", + "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1", + "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2", + "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 12 * 1024], 3", + "vmovdqu32 xmm28, xmmword ptr [rdi + 2 * 16 + 1 * 1024]", + "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1", + "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2", + "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3", + "vpunpckldq zmm26, zmm27, zmm28", + "vpunpckhdq zmm27, zmm27, zmm28", + "vmovdqu32 xmm29, xmmword ptr [rdi + 2 * 16 + 2 * 1024]", + "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1", + "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2", + "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 14 * 1024], 3", + "vmovdqu32 xmm30, xmmword ptr [rdi + 2 * 16 + 3 * 1024]", + "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1", + "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2", + "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 15 * 
1024], 3", + "vpunpckldq zmm28, zmm29, zmm30", + "vpunpckhdq zmm29, zmm29, zmm30", + "vpunpcklqdq zmm24, zmm26, zmm28", + "vpunpckhqdq zmm25, zmm26, zmm28", + "vpunpcklqdq zmm26, zmm27, zmm29", + "vpunpckhqdq zmm27, zmm27, zmm29", + "vmovdqu32 xmm31, xmmword ptr [rdi + 3 * 16 + 0 * 1024]", + "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1", + "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2", + "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 12 * 1024], 3", + "vmovdqu32 xmm13, xmmword ptr [rdi + 3 * 16 + 1 * 1024]", + "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1", + "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2", + "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3", + "vpunpckldq zmm30, zmm31, zmm13", + "vpunpckhdq zmm31, zmm31, zmm13", + "vmovdqu32 xmm14, xmmword ptr [rdi + 3 * 16 + 2 * 1024]", + "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1", + "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2", + "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 14 * 1024], 3", + "vmovdqu32 xmm15, xmmword ptr [rdi + 3 * 16 + 3 * 1024]", + "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1", + "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2", + "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3", + "vpunpckldq zmm13, zmm14, zmm15", + "vpunpckhdq zmm14, zmm14, zmm15", + "vpunpcklqdq zmm28, zmm30, zmm13", + "vpunpckhqdq zmm29, zmm30, zmm13", + "vpunpcklqdq zmm30, zmm31, zmm14", + "vpunpckhqdq zmm31, zmm31, zmm14", // Initialize the third and fourth rows of the state, which we just used as scratch space // during transposition. "vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants |
