| author | Jack O'Connor <[email protected]> | 2022-03-09 00:52:00 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-09 00:56:09 -0500 |
| commit | deac82543627cf79010b58cf1472c8261456dcbf | |
| tree | e7c60a2ec47562d56a76d9821db0d230f48507a7 | |
| parent | 4c929ddac1ac3d39a1285a1527fd916d7934d7ad | |
interleave the write ops in blake3_avx512_xor_stream_16
This seems to give a small but consistent performance boost.
| -rw-r--r-- | src/kernel.rs | 154 |
1 file changed, 78 insertions, 76 deletions
```diff
diff --git a/src/kernel.rs b/src/kernel.rs
index eeab5e7..7542565 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1245,105 +1245,107 @@ global_asm!(
     // And produces vectors like:
     //
     // a0, a1, b0, b1, e0, e1, g0, g1, i0, i1, k0, k1, m0, m1, o0, o1
+    //
+    // Then interleave 64-bit words back into zmm0-zmm15, producing vectors like:
+    //
+    // a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
+    //
+    // Finally, write out each 128-bit group, unaligned.
     "vpunpckldq zmm16, zmm0, zmm1",
     "vpunpckhdq zmm17, zmm0, zmm1",
     "vpunpckldq zmm18, zmm2, zmm3",
     "vpunpckhdq zmm19, zmm2, zmm3",
-    "vpunpckldq zmm20, zmm4, zmm5",
-    "vpunpckhdq zmm21, zmm4, zmm5",
-    "vpunpckldq zmm22, zmm6, zmm7",
-    "vpunpckhdq zmm23, zmm6, zmm7",
-    "vpunpckldq zmm24, zmm8, zmm9",
-    "vpunpckhdq zmm25, zmm8, zmm9",
-    "vpunpckldq zmm26, zmm10, zmm11",
-    "vpunpckhdq zmm27, zmm10, zmm11",
-    "vpunpckldq zmm28, zmm12, zmm13",
-    "vpunpckhdq zmm29, zmm12, zmm13",
-    "vpunpckldq zmm30, zmm14, zmm15",
-    "vpunpckhdq zmm31, zmm14, zmm15",
-    // Then interleave 64-bit words back into zmm0-zmm15, producing vectors like:
-    //
-    // a0, a1, a2, a3, e0, e1, e2, e3, i0, i1, i2, i3, m0, m1, m2, m3
     "vpunpcklqdq zmm0, zmm16, zmm18",
+    "vmovdqu32 xmmword ptr [r9 + 0 * 16], xmm0",
+    "vextracti32x4 xmmword ptr [r9 + 16 * 16], zmm0, 1",
+    "vextracti32x4 xmmword ptr [r9 + 32 * 16], zmm0, 2",
+    "vextracti32x4 xmmword ptr [r9 + 48 * 16], zmm0, 3",
     "vpunpckhqdq zmm1, zmm16, zmm18",
+    "vmovdqu32 xmmword ptr [r9 + 4 * 16], xmm1",
+    "vextracti32x4 xmmword ptr [r9 + 20 * 16], zmm1, 1",
+    "vextracti32x4 xmmword ptr [r9 + 36 * 16], zmm1, 2",
+    "vextracti32x4 xmmword ptr [r9 + 52 * 16], zmm1, 3",
     "vpunpcklqdq zmm2, zmm17, zmm19",
+    "vmovdqu32 xmmword ptr [r9 + 8 * 16], xmm2",
+    "vextracti32x4 xmmword ptr [r9 + 24 * 16], zmm2, 1",
+    "vextracti32x4 xmmword ptr [r9 + 40 * 16], zmm2, 2",
+    "vextracti32x4 xmmword ptr [r9 + 56 * 16], zmm2, 3",
     "vpunpckhqdq zmm3, zmm17, zmm19",
+    "vmovdqu32 xmmword ptr [r9 + 12 * 16], xmm3",
+    "vextracti32x4 xmmword ptr [r9 + 28 * 16], zmm3, 1",
+    "vextracti32x4 xmmword ptr [r9 + 44 * 16], zmm3, 2",
+    "vextracti32x4 xmmword ptr [r9 + 60 * 16], zmm3, 3",
+    "vpunpckldq zmm20, zmm4, zmm5",
+    "vpunpckhdq zmm21, zmm4, zmm5",
+    "vpunpckldq zmm22, zmm6, zmm7",
+    "vpunpckhdq zmm23, zmm6, zmm7",
     "vpunpcklqdq zmm4, zmm20, zmm22",
+    "vmovdqu32 xmmword ptr [r9 + 1 * 16], xmm4",
+    "vextracti32x4 xmmword ptr [r9 + 17 * 16], zmm4, 1",
+    "vextracti32x4 xmmword ptr [r9 + 33 * 16], zmm4, 2",
+    "vextracti32x4 xmmword ptr [r9 + 49 * 16], zmm4, 3",
     "vpunpckhqdq zmm5, zmm20, zmm22",
+    "vmovdqu32 xmmword ptr [r9 + 5 * 16], xmm5",
+    "vextracti32x4 xmmword ptr [r9 + 21 * 16], zmm5, 1",
+    "vextracti32x4 xmmword ptr [r9 + 37 * 16], zmm5, 2",
+    "vextracti32x4 xmmword ptr [r9 + 53 * 16], zmm5, 3",
     "vpunpcklqdq zmm6, zmm21, zmm23",
+    "vmovdqu32 xmmword ptr [r9 + 9 * 16], xmm6",
+    "vextracti32x4 xmmword ptr [r9 + 25 * 16], zmm6, 1",
+    "vextracti32x4 xmmword ptr [r9 + 41 * 16], zmm6, 2",
+    "vextracti32x4 xmmword ptr [r9 + 57 * 16], zmm6, 3",
     "vpunpckhqdq zmm7, zmm21, zmm23",
+    "vmovdqu32 xmmword ptr [r9 + 13 * 16], xmm7",
+    "vextracti32x4 xmmword ptr [r9 + 29 * 16], zmm7, 1",
+    "vextracti32x4 xmmword ptr [r9 + 45 * 16], zmm7, 2",
+    "vextracti32x4 xmmword ptr [r9 + 61 * 16], zmm7, 3",
+    "vpunpckldq zmm24, zmm8, zmm9",
+    "vpunpckhdq zmm25, zmm8, zmm9",
+    "vpunpckldq zmm26, zmm10, zmm11",
+    "vpunpckhdq zmm27, zmm10, zmm11",
     "vpunpcklqdq zmm8, zmm24, zmm26",
+    "vmovdqu32 xmmword ptr [r9 + 2 * 16], xmm8",
+    "vextracti32x4 xmmword ptr [r9 + 18 * 16], zmm8, 1",
+    "vextracti32x4 xmmword ptr [r9 + 34 * 16], zmm8, 2",
+    "vextracti32x4 xmmword ptr [r9 + 50 * 16], zmm8, 3",
     "vpunpckhqdq zmm9, zmm24, zmm26",
+    "vmovdqu32 xmmword ptr [r9 + 6 * 16], xmm9",
+    "vextracti32x4 xmmword ptr [r9 + 22 * 16], zmm9, 1",
+    "vextracti32x4 xmmword ptr [r9 + 38 * 16], zmm9, 2",
+    "vextracti32x4 xmmword ptr [r9 + 54 * 16], zmm9, 3",
     "vpunpcklqdq zmm10, zmm25, zmm27",
+    "vmovdqu32 xmmword ptr [r9 + 10 * 16], xmm10",
+    "vextracti32x4 xmmword ptr [r9 + 26 * 16], zmm10, 1",
+    "vextracti32x4 xmmword ptr [r9 + 42 * 16], zmm10, 2",
+    "vextracti32x4 xmmword ptr [r9 + 58 * 16], zmm10, 3",
     "vpunpckhqdq zmm11, zmm25, zmm27",
+    "vmovdqu32 xmmword ptr [r9 + 14 * 16], xmm11",
+    "vextracti32x4 xmmword ptr [r9 + 30 * 16], zmm11, 1",
+    "vextracti32x4 xmmword ptr [r9 + 46 * 16], zmm11, 2",
+    "vextracti32x4 xmmword ptr [r9 + 62 * 16], zmm11, 3",
+    "vpunpckldq zmm28, zmm12, zmm13",
+    "vpunpckhdq zmm29, zmm12, zmm13",
+    "vpunpckldq zmm30, zmm14, zmm15",
+    "vpunpckhdq zmm31, zmm14, zmm15",
     "vpunpcklqdq zmm12, zmm28, zmm30",
-    "vpunpckhqdq zmm13, zmm28, zmm30",
-    "vpunpcklqdq zmm14, zmm29, zmm31",
-    "vpunpckhqdq zmm15, zmm29, zmm31",
-    // Finally, write out each 128-bit group, unaligned.
-    "vmovdqu32 xmmword ptr [r9 + 0 * 16], xmm0",
-    "vmovdqu32 xmmword ptr [r9 + 1 * 16], xmm4",
-    "vmovdqu32 xmmword ptr [r9 + 2 * 16], xmm8",
     "vmovdqu32 xmmword ptr [r9 + 3 * 16], xmm12",
-    "vmovdqu32 xmmword ptr [r9 + 4 * 16], xmm1",
-    "vmovdqu32 xmmword ptr [r9 + 5 * 16], xmm5",
-    "vmovdqu32 xmmword ptr [r9 + 6 * 16], xmm9",
-    "vmovdqu32 xmmword ptr [r9 + 7 * 16], xmm13",
-    "vmovdqu32 xmmword ptr [r9 + 8 * 16], xmm2",
-    "vmovdqu32 xmmword ptr [r9 + 9 * 16], xmm6",
-    "vmovdqu32 xmmword ptr [r9 + 10 * 16], xmm10",
-    "vmovdqu32 xmmword ptr [r9 + 11 * 16], xmm14",
-    "vmovdqu32 xmmword ptr [r9 + 12 * 16], xmm3",
-    "vmovdqu32 xmmword ptr [r9 + 13 * 16], xmm7",
-    "vmovdqu32 xmmword ptr [r9 + 14 * 16], xmm11",
-    "vmovdqu32 xmmword ptr [r9 + 15 * 16], xmm15",
-    "vextracti32x4 xmmword ptr [r9 + 16 * 16], zmm0, 1",
-    "vextracti32x4 xmmword ptr [r9 + 17 * 16], zmm4, 1",
-    "vextracti32x4 xmmword ptr [r9 + 18 * 16], zmm8, 1",
     "vextracti32x4 xmmword ptr [r9 + 19 * 16], zmm12, 1",
-    "vextracti32x4 xmmword ptr [r9 + 20 * 16], zmm1, 1",
-    "vextracti32x4 xmmword ptr [r9 + 21 * 16], zmm5, 1",
-    "vextracti32x4 xmmword ptr [r9 + 22 * 16], zmm9, 1",
-    "vextracti32x4 xmmword ptr [r9 + 23 * 16], zmm13, 1",
-    "vextracti32x4 xmmword ptr [r9 + 24 * 16], zmm2, 1",
-    "vextracti32x4 xmmword ptr [r9 + 25 * 16], zmm6, 1",
-    "vextracti32x4 xmmword ptr [r9 + 26 * 16], zmm10, 1",
-    "vextracti32x4 xmmword ptr [r9 + 27 * 16], zmm14, 1",
-    "vextracti32x4 xmmword ptr [r9 + 28 * 16], zmm3, 1",
-    "vextracti32x4 xmmword ptr [r9 + 29 * 16], zmm7, 1",
-    "vextracti32x4 xmmword ptr [r9 + 30 * 16], zmm11, 1",
-    "vextracti32x4 xmmword ptr [r9 + 31 * 16], zmm15, 1",
-    "vextracti32x4 xmmword ptr [r9 + 32 * 16], zmm0, 2",
-    "vextracti32x4 xmmword ptr [r9 + 33 * 16], zmm4, 2",
-    "vextracti32x4 xmmword ptr [r9 + 34 * 16], zmm8, 2",
     "vextracti32x4 xmmword ptr [r9 + 35 * 16], zmm12, 2",
-    "vextracti32x4 xmmword ptr [r9 + 36 * 16], zmm1, 2",
-    "vextracti32x4 xmmword ptr [r9 + 37 * 16], zmm5, 2",
-    "vextracti32x4 xmmword ptr [r9 + 38 * 16], zmm9, 2",
-    "vextracti32x4 xmmword ptr [r9 + 39 * 16], zmm13, 2",
-    "vextracti32x4 xmmword ptr [r9 + 40 * 16], zmm2, 2",
-    "vextracti32x4 xmmword ptr [r9 + 41 * 16], zmm6, 2",
-    "vextracti32x4 xmmword ptr [r9 + 42 * 16], zmm10, 2",
-    "vextracti32x4 xmmword ptr [r9 + 43 * 16], zmm14, 2",
-    "vextracti32x4 xmmword ptr [r9 + 44 * 16], zmm3, 2",
-    "vextracti32x4 xmmword ptr [r9 + 45 * 16], zmm7, 2",
-    "vextracti32x4 xmmword ptr [r9 + 46 * 16], zmm11, 2",
-    "vextracti32x4 xmmword ptr [r9 + 47 * 16], zmm15, 2",
-    "vextracti32x4 xmmword ptr [r9 + 48 * 16], zmm0, 3",
-    "vextracti32x4 xmmword ptr [r9 + 49 * 16], zmm4, 3",
-    "vextracti32x4 xmmword ptr [r9 + 50 * 16], zmm8, 3",
     "vextracti32x4 xmmword ptr [r9 + 51 * 16], zmm12, 3",
-    "vextracti32x4 xmmword ptr [r9 + 52 * 16], zmm1, 3",
-    "vextracti32x4 xmmword ptr [r9 + 53 * 16], zmm5, 3",
-    "vextracti32x4 xmmword ptr [r9 + 54 * 16], zmm9, 3",
+    "vpunpckhqdq zmm13, zmm28, zmm30",
+    "vmovdqu32 xmmword ptr [r9 + 7 * 16], xmm13",
+    "vextracti32x4 xmmword ptr [r9 + 23 * 16], zmm13, 1",
+    "vextracti32x4 xmmword ptr [r9 + 39 * 16], zmm13, 2",
     "vextracti32x4 xmmword ptr [r9 + 55 * 16], zmm13, 3",
-    "vextracti32x4 xmmword ptr [r9 + 56 * 16], zmm2, 3",
-    "vextracti32x4 xmmword ptr [r9 + 57 * 16], zmm6, 3",
-    "vextracti32x4 xmmword ptr [r9 + 58 * 16], zmm10, 3",
+    "vpunpcklqdq zmm14, zmm29, zmm31",
+    "vmovdqu32 xmmword ptr [r9 + 11 * 16], xmm14",
+    "vextracti32x4 xmmword ptr [r9 + 27 * 16], zmm14, 1",
+    "vextracti32x4 xmmword ptr [r9 + 43 * 16], zmm14, 2",
    "vextracti32x4 xmmword ptr [r9 + 59 * 16], zmm14, 3",
-    "vextracti32x4 xmmword ptr [r9 + 60 * 16], zmm3, 3",
-    "vextracti32x4 xmmword ptr [r9 + 61 * 16], zmm7, 3",
-    "vextracti32x4 xmmword ptr [r9 + 62 * 16], zmm11, 3",
+    "vpunpckhqdq zmm15, zmm29, zmm31",
+    "vmovdqu32 xmmword ptr [r9 + 15 * 16], xmm15",
+    "vextracti32x4 xmmword ptr [r9 + 31 * 16], zmm15, 1",
+    "vextracti32x4 xmmword ptr [r9 + 47 * 16], zmm15, 2",
     "vextracti32x4 xmmword ptr [r9 + 63 * 16], zmm15, 3",
     "vzeroupper",
     "ret",
```
