about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jack O'Connor <[email protected]> 2022-03-08 13:00:13 -0500
committer: Jack O'Connor <[email protected]> 2022-03-08 22:23:09 -0500
commit: bcbbcc8d2c61e2653d3434b049e3c7d857984bd3 (patch)
tree: dc35fcbbe1dfd1f106d96e5708013e7b8bb7e1c5
parent: 78b8e87f91a1c9253a8de2f088d09f5d0c15d990 (diff)
now using only 3 scratch zmm registers
-rw-r--r-- src/kernel.rs 193
1 file changed, 96 insertions, 97 deletions
diff --git a/src/kernel.rs b/src/kernel.rs
index 09860e4..6d06186 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -828,103 +828,102 @@ global_asm!(
// and invokes blake3_avx512_kernel_16.
// --------------------------------------------------------------------------------------------
"blake3_avx512_blocks_16:",
- "vmovdqu32 xmm8, xmmword ptr [rdi + 0 * 16 + 0 * 1024]",
- "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 4 * 1024], 1",
- "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 8 * 1024], 2",
- "vinserti32x4 zmm8, zmm8, xmmword ptr [rdi + 0 * 16 + 12 * 1024], 3",
- "vmovdqu32 xmm9, xmmword ptr [rdi + 0 * 16 + 1 * 1024]",
- "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1",
- "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2",
- "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3",
- "vpunpckldq zmm24, zmm8, zmm9",
- "vpunpckhdq zmm25, zmm8, zmm9",
- "vmovdqu32 xmm10, xmmword ptr [rdi + 0 * 16 + 2 * 1024]",
- "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1",
- "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2",
- "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 14 * 1024], 3",
- "vmovdqu32 xmm11, xmmword ptr [rdi + 0 * 16 + 3 * 1024]",
- "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1",
- "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2",
- "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3",
- "vpunpckldq zmm26, zmm10, zmm11",
- "vpunpckhdq zmm27, zmm10, zmm11",
- "vmovdqu32 xmm12, xmmword ptr [rdi + 1 * 16 + 0 * 1024]",
- "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1",
- "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2",
- "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 12 * 1024], 3",
- "vmovdqu32 xmm13, xmmword ptr [rdi + 1 * 16 + 1 * 1024]",
- "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1",
- "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2",
- "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3",
- "vpunpckldq zmm28, zmm12, zmm13",
- "vpunpckhdq zmm29, zmm12, zmm13",
- "vmovdqu32 xmm14, xmmword ptr [rdi + 1 * 16 + 2 * 1024]",
- "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1",
- "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2",
- "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 14 * 1024], 3",
- "vmovdqu32 xmm15, xmmword ptr [rdi + 1 * 16 + 3 * 1024]",
- "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1",
- "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2",
- "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3",
- "vpunpckldq zmm30, zmm14, zmm15",
- "vpunpckhdq zmm31, zmm14, zmm15",
- "vmovdqu32 xmm16, xmmword ptr [rdi + 2 * 16 + 0 * 1024]",
- "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1",
- "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2",
- "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 12 * 1024], 3",
- "vmovdqu32 xmm17, xmmword ptr [rdi + 2 * 16 + 1 * 1024]",
- "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1",
- "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2",
- "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3",
- "vpunpckldq zmm8, zmm16, zmm17",
- "vpunpckhdq zmm9, zmm16, zmm17",
- "vmovdqu32 xmm18, xmmword ptr [rdi + 2 * 16 + 2 * 1024]",
- "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1",
- "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2",
- "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 14 * 1024], 3",
- "vmovdqu32 xmm19, xmmword ptr [rdi + 2 * 16 + 3 * 1024]",
- "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1",
- "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2",
- "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 15 * 1024], 3",
- "vpunpckldq zmm10, zmm18, zmm19",
- "vpunpckhdq zmm11, zmm18, zmm19",
- "vmovdqu32 xmm20, xmmword ptr [rdi + 3 * 16 + 0 * 1024]",
- "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1",
- "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2",
- "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 12 * 1024], 3",
- "vmovdqu32 xmm21, xmmword ptr [rdi + 3 * 16 + 1 * 1024]",
- "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1",
- "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2",
- "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3",
- "vpunpckldq zmm12, zmm20, zmm21",
- "vpunpckhdq zmm13, zmm20, zmm21",
- "vmovdqu32 xmm22, xmmword ptr [rdi + 3 * 16 + 2 * 1024]",
- "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1",
- "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2",
- "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 14 * 1024], 3",
- "vmovdqu32 xmm23, xmmword ptr [rdi + 3 * 16 + 3 * 1024]",
- "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1",
- "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2",
- "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3",
- "vpunpckldq zmm14, zmm22, zmm23",
- "vpunpckhdq zmm15, zmm22, zmm23",
- // interleave 64-bit words
- "vpunpcklqdq zmm16, zmm24, zmm26",
- "vpunpckhqdq zmm17, zmm24, zmm26",
- "vpunpcklqdq zmm18, zmm25, zmm27",
- "vpunpckhqdq zmm19, zmm25, zmm27",
- "vpunpcklqdq zmm20, zmm28, zmm30",
- "vpunpckhqdq zmm21, zmm28, zmm30",
- "vpunpcklqdq zmm22, zmm29, zmm31",
- "vpunpckhqdq zmm23, zmm29, zmm31",
- "vpunpcklqdq zmm24, zmm8, zmm10",
- "vpunpckhqdq zmm25, zmm8, zmm10",
- "vpunpcklqdq zmm26, zmm9, zmm11",
- "vpunpckhqdq zmm27, zmm9, zmm11",
- "vpunpcklqdq zmm28, zmm12, zmm14",
- "vpunpckhqdq zmm29, zmm12, zmm14",
- "vpunpcklqdq zmm30, zmm13, zmm15",
- "vpunpckhqdq zmm31, zmm13, zmm15",
+ "vmovdqu32 xmm19, xmmword ptr [rdi + 0 * 16 + 0 * 1024]",
+ "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 4 * 1024], 1",
+ "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 8 * 1024], 2",
+ "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 0 * 16 + 12 * 1024], 3",
+ "vmovdqu32 xmm20, xmmword ptr [rdi + 0 * 16 + 1 * 1024]",
+ "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1",
+ "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2",
+ "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm18, zmm19, zmm20",
+ "vpunpckhdq zmm19, zmm19, zmm20",
+ "vmovdqu32 xmm21, xmmword ptr [rdi + 0 * 16 + 2 * 1024]",
+ "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1",
+ "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2",
+ "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 0 * 16 + 14 * 1024], 3",
+ "vmovdqu32 xmm22, xmmword ptr [rdi + 0 * 16 + 3 * 1024]",
+ "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1",
+ "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2",
+ "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm20, zmm21, zmm22",
+ "vpunpckhdq zmm21, zmm21, zmm22",
+ "vpunpcklqdq zmm16, zmm18, zmm20",
+ "vpunpckhqdq zmm17, zmm18, zmm20",
+ "vpunpcklqdq zmm18, zmm19, zmm21",
+ "vpunpckhqdq zmm19, zmm19, zmm21",
+ "vmovdqu32 xmm23, xmmword ptr [rdi + 1 * 16 + 0 * 1024]",
+ "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1",
+ "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2",
+ "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 1 * 16 + 12 * 1024], 3",
+ "vmovdqu32 xmm24, xmmword ptr [rdi + 1 * 16 + 1 * 1024]",
+ "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1",
+ "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2",
+ "vinserti32x4 zmm24, zmm24, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm22, zmm23, zmm24",
+ "vpunpckhdq zmm23, zmm23, zmm24",
+ "vmovdqu32 xmm25, xmmword ptr [rdi + 1 * 16 + 2 * 1024]",
+ "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1",
+ "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2",
+ "vinserti32x4 zmm25, zmm25, xmmword ptr [rdi + 1 * 16 + 14 * 1024], 3",
+ "vmovdqu32 xmm26, xmmword ptr [rdi + 1 * 16 + 3 * 1024]",
+ "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1",
+ "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2",
+ "vinserti32x4 zmm26, zmm26, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm24, zmm25, zmm26",
+ "vpunpckhdq zmm25, zmm25, zmm26",
+ "vpunpcklqdq zmm20, zmm22, zmm24",
+ "vpunpckhqdq zmm21, zmm22, zmm24",
+ "vpunpcklqdq zmm22, zmm23, zmm25",
+ "vpunpckhqdq zmm23, zmm23, zmm25",
+ "vmovdqu32 xmm27, xmmword ptr [rdi + 2 * 16 + 0 * 1024]",
+ "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1",
+ "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2",
+ "vinserti32x4 zmm27, zmm27, xmmword ptr [rdi + 2 * 16 + 12 * 1024], 3",
+ "vmovdqu32 xmm28, xmmword ptr [rdi + 2 * 16 + 1 * 1024]",
+ "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1",
+ "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2",
+ "vinserti32x4 zmm28, zmm28, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm26, zmm27, zmm28",
+ "vpunpckhdq zmm27, zmm27, zmm28",
+ "vmovdqu32 xmm29, xmmword ptr [rdi + 2 * 16 + 2 * 1024]",
+ "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1",
+ "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2",
+ "vinserti32x4 zmm29, zmm29, xmmword ptr [rdi + 2 * 16 + 14 * 1024], 3",
+ "vmovdqu32 xmm30, xmmword ptr [rdi + 2 * 16 + 3 * 1024]",
+ "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1",
+ "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2",
+ "vinserti32x4 zmm30, zmm30, xmmword ptr [rdi + 2 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm28, zmm29, zmm30",
+ "vpunpckhdq zmm29, zmm29, zmm30",
+ "vpunpcklqdq zmm24, zmm26, zmm28",
+ "vpunpckhqdq zmm25, zmm26, zmm28",
+ "vpunpcklqdq zmm26, zmm27, zmm29",
+ "vpunpckhqdq zmm27, zmm27, zmm29",
+ "vmovdqu32 xmm31, xmmword ptr [rdi + 3 * 16 + 0 * 1024]",
+ "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1",
+ "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2",
+ "vinserti32x4 zmm31, zmm31, xmmword ptr [rdi + 3 * 16 + 12 * 1024], 3",
+ "vmovdqu32 xmm13, xmmword ptr [rdi + 3 * 16 + 1 * 1024]",
+ "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1",
+ "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2",
+ "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm30, zmm31, zmm13",
+ "vpunpckhdq zmm31, zmm31, zmm13",
+ "vmovdqu32 xmm14, xmmword ptr [rdi + 3 * 16 + 2 * 1024]",
+ "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1",
+ "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2",
+ "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 3 * 16 + 14 * 1024], 3",
+ "vmovdqu32 xmm15, xmmword ptr [rdi + 3 * 16 + 3 * 1024]",
+ "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1",
+ "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2",
+ "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm13, zmm14, zmm15",
+ "vpunpckhdq zmm14, zmm14, zmm15",
+ "vpunpcklqdq zmm28, zmm30, zmm13",
+ "vpunpckhqdq zmm29, zmm30, zmm13",
+ "vpunpcklqdq zmm30, zmm31, zmm14",
+ "vpunpckhqdq zmm31, zmm31, zmm14",
// Initialize the third and fourth rows of the state, which we just used as scratch space
// during transposition.
"vmovdqa32 zmm8, zmmword ptr [BLAKE3_IV0_16 + rip]", // IV constants