author     Jack O'Connor <[email protected]>   2022-03-09 00:52:00 -0500
committer  Jack O'Connor <[email protected]>   2022-03-09 00:56:09 -0500
commit     deac82543627cf79010b58cf1472c8261456dcbf (patch)
tree       e7c60a2ec47562d56a76d9821db0d230f48507a7
parent     4c929ddac1ac3d39a1285a1527fd916d7934d7ad (diff)
interleave the write ops in blake3_avx512_xor_stream_16
This seems to give a small but consistent performance boost.
-rw-r--r--  src/kernel.rs | 154
1 file changed, 78 insertions(+), 76 deletions(-)
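
For context, the change reorders the tail of the 16-way XOR-stream kernel: instead of finishing the entire dword/qword transpose and then issuing all sixty-four stores in one block, each transposed vector's four 128-bit stores are issued as soon as that vector is ready, so the stores can overlap with the remaining shuffle work. A minimal Rust sketch of the two orderings, using stable SSE2 intrinsics as a scaled-down stand-in for the kernel's zmm registers (the function names and the two-vector scale are illustrative, not taken from kernel.rs):

use core::arch::x86_64::*;

// Old ordering: all shuffles first, then all stores in one block.
#[target_feature(enable = "sse2")]
unsafe fn stores_grouped(out: *mut u8, a: __m128i, b: __m128i) {
    let lo = _mm_unpacklo_epi32(a, b); // a0 b0 a1 b1
    let hi = _mm_unpackhi_epi32(a, b); // a2 b2 a3 b3
    _mm_storeu_si128(out.cast(), lo);
    _mm_storeu_si128(out.add(16).cast(), hi);
}

// New ordering: each result is stored as soon as it is produced, letting the
// store issue while the next shuffle is still in flight.
#[target_feature(enable = "sse2")]
unsafe fn stores_interleaved(out: *mut u8, a: __m128i, b: __m128i) {
    let lo = _mm_unpacklo_epi32(a, b);
    _mm_storeu_si128(out.cast(), lo);
    let hi = _mm_unpackhi_epi32(a, b);
    _mm_storeu_si128(out.add(16).cast(), hi);
}

At this tiny scale an out-of-order core will usually run both orderings identically; in the full kernel below, where sixteen shuffles feed sixty-four stores, shortening the dependency tail is plausibly what produces the small but consistent win the commit message describes.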
diff --git a/src/kernel.rs b/src/kernel.rs
index eeab5e7..7542565 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1245,105 +1245,107 @@ global_asm!(
// And produces vectors like:
//
// a0, a1, b0, b1, e0, e1, g0, g1, i0, i1, k0, k1, m0, m1, o0, o1
+ //
+ // Then interleave 64-bit words back into zmm0-zmm15, producing vectors like:
+ //
+ // a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
+ //
+ // Finally, write out each 128-bit group, unaligned.
"vpunpckldq zmm16, zmm0, zmm1",
"vpunpckhdq zmm17, zmm0, zmm1",
"vpunpckldq zmm18, zmm2, zmm3",
"vpunpckhdq zmm19, zmm2, zmm3",
- "vpunpckldq zmm20, zmm4, zmm5",
- "vpunpckhdq zmm21, zmm4, zmm5",
- "vpunpckldq zmm22, zmm6, zmm7",
- "vpunpckhdq zmm23, zmm6, zmm7",
- "vpunpckldq zmm24, zmm8, zmm9",
- "vpunpckhdq zmm25, zmm8, zmm9",
- "vpunpckldq zmm26, zmm10, zmm11",
- "vpunpckhdq zmm27, zmm10, zmm11",
- "vpunpckldq zmm28, zmm12, zmm13",
- "vpunpckhdq zmm29, zmm12, zmm13",
- "vpunpckldq zmm30, zmm14, zmm15",
- "vpunpckhdq zmm31, zmm14, zmm15",
- // Then interleave 64-bit words back into zmm0-zmm15, producing vectors like:
- //
- // a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
"vpunpcklqdq zmm0, zmm16, zmm18",
+ "vmovdqu32 xmmword ptr [r9 + 0 * 16], xmm0",
+ "vextracti32x4 xmmword ptr [r9 + 16 * 16], zmm0, 1",
+ "vextracti32x4 xmmword ptr [r9 + 32 * 16], zmm0, 2",
+ "vextracti32x4 xmmword ptr [r9 + 48 * 16], zmm0, 3",
"vpunpckhqdq zmm1, zmm16, zmm18",
+ "vmovdqu32 xmmword ptr [r9 + 4 * 16], xmm1",
+ "vextracti32x4 xmmword ptr [r9 + 20 * 16], zmm1, 1",
+ "vextracti32x4 xmmword ptr [r9 + 36 * 16], zmm1, 2",
+ "vextracti32x4 xmmword ptr [r9 + 52 * 16], zmm1, 3",
"vpunpcklqdq zmm2, zmm17, zmm19",
+ "vmovdqu32 xmmword ptr [r9 + 8 * 16], xmm2",
+ "vextracti32x4 xmmword ptr [r9 + 24 * 16], zmm2, 1",
+ "vextracti32x4 xmmword ptr [r9 + 40 * 16], zmm2, 2",
+ "vextracti32x4 xmmword ptr [r9 + 56 * 16], zmm2, 3",
"vpunpckhqdq zmm3, zmm17, zmm19",
+ "vmovdqu32 xmmword ptr [r9 + 12 * 16], xmm3",
+ "vextracti32x4 xmmword ptr [r9 + 28 * 16], zmm3, 1",
+ "vextracti32x4 xmmword ptr [r9 + 44 * 16], zmm3, 2",
+ "vextracti32x4 xmmword ptr [r9 + 60 * 16], zmm3, 3",
+ "vpunpckldq zmm20, zmm4, zmm5",
+ "vpunpckhdq zmm21, zmm4, zmm5",
+ "vpunpckldq zmm22, zmm6, zmm7",
+ "vpunpckhdq zmm23, zmm6, zmm7",
"vpunpcklqdq zmm4, zmm20, zmm22",
+ "vmovdqu32 xmmword ptr [r9 + 1 * 16], xmm4",
+ "vextracti32x4 xmmword ptr [r9 + 17 * 16], zmm4, 1",
+ "vextracti32x4 xmmword ptr [r9 + 33 * 16], zmm4, 2",
+ "vextracti32x4 xmmword ptr [r9 + 49 * 16], zmm4, 3",
"vpunpckhqdq zmm5, zmm20, zmm22",
+ "vmovdqu32 xmmword ptr [r9 + 5 * 16], xmm5",
+ "vextracti32x4 xmmword ptr [r9 + 21 * 16], zmm5, 1",
+ "vextracti32x4 xmmword ptr [r9 + 37 * 16], zmm5, 2",
+ "vextracti32x4 xmmword ptr [r9 + 53 * 16], zmm5, 3",
"vpunpcklqdq zmm6, zmm21, zmm23",
+ "vmovdqu32 xmmword ptr [r9 + 9 * 16], xmm6",
+ "vextracti32x4 xmmword ptr [r9 + 25 * 16], zmm6, 1",
+ "vextracti32x4 xmmword ptr [r9 + 41 * 16], zmm6, 2",
+ "vextracti32x4 xmmword ptr [r9 + 57 * 16], zmm6, 3",
"vpunpckhqdq zmm7, zmm21, zmm23",
+ "vmovdqu32 xmmword ptr [r9 + 13 * 16], xmm7",
+ "vextracti32x4 xmmword ptr [r9 + 29 * 16], zmm7, 1",
+ "vextracti32x4 xmmword ptr [r9 + 45 * 16], zmm7, 2",
+ "vextracti32x4 xmmword ptr [r9 + 61 * 16], zmm7, 3",
+ "vpunpckldq zmm24, zmm8, zmm9",
+ "vpunpckhdq zmm25, zmm8, zmm9",
+ "vpunpckldq zmm26, zmm10, zmm11",
+ "vpunpckhdq zmm27, zmm10, zmm11",
"vpunpcklqdq zmm8, zmm24, zmm26",
+ "vmovdqu32 xmmword ptr [r9 + 2 * 16], xmm8",
+ "vextracti32x4 xmmword ptr [r9 + 18 * 16], zmm8, 1",
+ "vextracti32x4 xmmword ptr [r9 + 34 * 16], zmm8, 2",
+ "vextracti32x4 xmmword ptr [r9 + 50 * 16], zmm8, 3",
"vpunpckhqdq zmm9, zmm24, zmm26",
+ "vmovdqu32 xmmword ptr [r9 + 6 * 16], xmm9",
+ "vextracti32x4 xmmword ptr [r9 + 22 * 16], zmm9, 1",
+ "vextracti32x4 xmmword ptr [r9 + 38 * 16], zmm9, 2",
+ "vextracti32x4 xmmword ptr [r9 + 54 * 16], zmm9, 3",
"vpunpcklqdq zmm10, zmm25, zmm27",
+ "vmovdqu32 xmmword ptr [r9 + 10 * 16], xmm10",
+ "vextracti32x4 xmmword ptr [r9 + 26 * 16], zmm10, 1",
+ "vextracti32x4 xmmword ptr [r9 + 42 * 16], zmm10, 2",
+ "vextracti32x4 xmmword ptr [r9 + 58 * 16], zmm10, 3",
"vpunpckhqdq zmm11, zmm25, zmm27",
+ "vmovdqu32 xmmword ptr [r9 + 14 * 16], xmm11",
+ "vextracti32x4 xmmword ptr [r9 + 30 * 16], zmm11, 1",
+ "vextracti32x4 xmmword ptr [r9 + 46 * 16], zmm11, 2",
+ "vextracti32x4 xmmword ptr [r9 + 62 * 16], zmm11, 3",
+ "vpunpckldq zmm28, zmm12, zmm13",
+ "vpunpckhdq zmm29, zmm12, zmm13",
+ "vpunpckldq zmm30, zmm14, zmm15",
+ "vpunpckhdq zmm31, zmm14, zmm15",
"vpunpcklqdq zmm12, zmm28, zmm30",
- "vpunpckhqdq zmm13, zmm28, zmm30",
- "vpunpcklqdq zmm14, zmm29, zmm31",
- "vpunpckhqdq zmm15, zmm29, zmm31",
- // Finally, write out each 128-bit group, unaligned.
- "vmovdqu32 xmmword ptr [r9 + 0 * 16], xmm0",
- "vmovdqu32 xmmword ptr [r9 + 1 * 16], xmm4",
- "vmovdqu32 xmmword ptr [r9 + 2 * 16], xmm8",
"vmovdqu32 xmmword ptr [r9 + 3 * 16], xmm12",
- "vmovdqu32 xmmword ptr [r9 + 4 * 16], xmm1",
- "vmovdqu32 xmmword ptr [r9 + 5 * 16], xmm5",
- "vmovdqu32 xmmword ptr [r9 + 6 * 16], xmm9",
- "vmovdqu32 xmmword ptr [r9 + 7 * 16], xmm13",
- "vmovdqu32 xmmword ptr [r9 + 8 * 16], xmm2",
- "vmovdqu32 xmmword ptr [r9 + 9 * 16], xmm6",
- "vmovdqu32 xmmword ptr [r9 + 10 * 16], xmm10",
- "vmovdqu32 xmmword ptr [r9 + 11 * 16], xmm14",
- "vmovdqu32 xmmword ptr [r9 + 12 * 16], xmm3",
- "vmovdqu32 xmmword ptr [r9 + 13 * 16], xmm7",
- "vmovdqu32 xmmword ptr [r9 + 14 * 16], xmm11",
- "vmovdqu32 xmmword ptr [r9 + 15 * 16], xmm15",
- "vextracti32x4 xmmword ptr [r9 + 16 * 16], zmm0, 1",
- "vextracti32x4 xmmword ptr [r9 + 17 * 16], zmm4, 1",
- "vextracti32x4 xmmword ptr [r9 + 18 * 16], zmm8, 1",
"vextracti32x4 xmmword ptr [r9 + 19 * 16], zmm12, 1",
- "vextracti32x4 xmmword ptr [r9 + 20 * 16], zmm1, 1",
- "vextracti32x4 xmmword ptr [r9 + 21 * 16], zmm5, 1",
- "vextracti32x4 xmmword ptr [r9 + 22 * 16], zmm9, 1",
- "vextracti32x4 xmmword ptr [r9 + 23 * 16], zmm13, 1",
- "vextracti32x4 xmmword ptr [r9 + 24 * 16], zmm2, 1",
- "vextracti32x4 xmmword ptr [r9 + 25 * 16], zmm6, 1",
- "vextracti32x4 xmmword ptr [r9 + 26 * 16], zmm10, 1",
- "vextracti32x4 xmmword ptr [r9 + 27 * 16], zmm14, 1",
- "vextracti32x4 xmmword ptr [r9 + 28 * 16], zmm3, 1",
- "vextracti32x4 xmmword ptr [r9 + 29 * 16], zmm7, 1",
- "vextracti32x4 xmmword ptr [r9 + 30 * 16], zmm11, 1",
- "vextracti32x4 xmmword ptr [r9 + 31 * 16], zmm15, 1",
- "vextracti32x4 xmmword ptr [r9 + 32 * 16], zmm0, 2",
- "vextracti32x4 xmmword ptr [r9 + 33 * 16], zmm4, 2",
- "vextracti32x4 xmmword ptr [r9 + 34 * 16], zmm8, 2",
"vextracti32x4 xmmword ptr [r9 + 35 * 16], zmm12, 2",
- "vextracti32x4 xmmword ptr [r9 + 36 * 16], zmm1, 2",
- "vextracti32x4 xmmword ptr [r9 + 37 * 16], zmm5, 2",
- "vextracti32x4 xmmword ptr [r9 + 38 * 16], zmm9, 2",
- "vextracti32x4 xmmword ptr [r9 + 39 * 16], zmm13, 2",
- "vextracti32x4 xmmword ptr [r9 + 40 * 16], zmm2, 2",
- "vextracti32x4 xmmword ptr [r9 + 41 * 16], zmm6, 2",
- "vextracti32x4 xmmword ptr [r9 + 42 * 16], zmm10, 2",
- "vextracti32x4 xmmword ptr [r9 + 43 * 16], zmm14, 2",
- "vextracti32x4 xmmword ptr [r9 + 44 * 16], zmm3, 2",
- "vextracti32x4 xmmword ptr [r9 + 45 * 16], zmm7, 2",
- "vextracti32x4 xmmword ptr [r9 + 46 * 16], zmm11, 2",
- "vextracti32x4 xmmword ptr [r9 + 47 * 16], zmm15, 2",
- "vextracti32x4 xmmword ptr [r9 + 48 * 16], zmm0, 3",
- "vextracti32x4 xmmword ptr [r9 + 49 * 16], zmm4, 3",
- "vextracti32x4 xmmword ptr [r9 + 50 * 16], zmm8, 3",
"vextracti32x4 xmmword ptr [r9 + 51 * 16], zmm12, 3",
- "vextracti32x4 xmmword ptr [r9 + 52 * 16], zmm1, 3",
- "vextracti32x4 xmmword ptr [r9 + 53 * 16], zmm5, 3",
- "vextracti32x4 xmmword ptr [r9 + 54 * 16], zmm9, 3",
+ "vpunpckhqdq zmm13, zmm28, zmm30",
+ "vmovdqu32 xmmword ptr [r9 + 7 * 16], xmm13",
+ "vextracti32x4 xmmword ptr [r9 + 23 * 16], zmm13, 1",
+ "vextracti32x4 xmmword ptr [r9 + 39 * 16], zmm13, 2",
"vextracti32x4 xmmword ptr [r9 + 55 * 16], zmm13, 3",
- "vextracti32x4 xmmword ptr [r9 + 56 * 16], zmm2, 3",
- "vextracti32x4 xmmword ptr [r9 + 57 * 16], zmm6, 3",
- "vextracti32x4 xmmword ptr [r9 + 58 * 16], zmm10, 3",
+ "vpunpcklqdq zmm14, zmm29, zmm31",
+ "vmovdqu32 xmmword ptr [r9 + 11 * 16], xmm14",
+ "vextracti32x4 xmmword ptr [r9 + 27 * 16], zmm14, 1",
+ "vextracti32x4 xmmword ptr [r9 + 43 * 16], zmm14, 2",
"vextracti32x4 xmmword ptr [r9 + 59 * 16], zmm14, 3",
- "vextracti32x4 xmmword ptr [r9 + 60 * 16], zmm3, 3",
- "vextracti32x4 xmmword ptr [r9 + 61 * 16], zmm7, 3",
- "vextracti32x4 xmmword ptr [r9 + 62 * 16], zmm11, 3",
+ "vpunpckhqdq zmm15, zmm29, zmm31",
+ "vmovdqu32 xmmword ptr [r9 + 15 * 16], xmm15",
+ "vextracti32x4 xmmword ptr [r9 + 31 * 16], zmm15, 1",
+ "vextracti32x4 xmmword ptr [r9 + 47 * 16], zmm15, 2",
"vextracti32x4 xmmword ptr [r9 + 63 * 16], zmm15, 3",
"vzeroupper",
"ret",