diff options
| author | Jack O'Connor <[email protected]> | 2022-03-08 12:03:03 -0500 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2022-03-08 22:23:09 -0500 |
| commit | 78b8e87f91a1c9253a8de2f088d09f5d0c15d990 (patch) | |
| tree | feb1556f7607be5681503eca5172289da76cf2d1 | |
| parent | 87a9318233fe8c5347453afecaa69242e09c929c (diff) | |
interleave the first pass -- good performance
| -rw-r--r-- | src/kernel.rs | 29 |
1 file changed, 14 insertions, 15 deletions
diff --git a/src/kernel.rs b/src/kernel.rs index 8336661..09860e4 100644 --- a/src/kernel.rs +++ b/src/kernel.rs @@ -836,6 +836,8 @@ global_asm!( "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1", "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2", "vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3", + "vpunpckldq zmm24, zmm8, zmm9", + "vpunpckhdq zmm25, zmm8, zmm9", "vmovdqu32 xmm10, xmmword ptr [rdi + 0 * 16 + 2 * 1024]", "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1", "vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2", @@ -844,6 +846,8 @@ global_asm!( "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1", "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2", "vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3", + "vpunpckldq zmm26, zmm10, zmm11", + "vpunpckhdq zmm27, zmm10, zmm11", "vmovdqu32 xmm12, xmmword ptr [rdi + 1 * 16 + 0 * 1024]", "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1", "vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2", @@ -852,6 +856,8 @@ global_asm!( "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1", "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2", "vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3", + "vpunpckldq zmm28, zmm12, zmm13", + "vpunpckhdq zmm29, zmm12, zmm13", "vmovdqu32 xmm14, xmmword ptr [rdi + 1 * 16 + 2 * 1024]", "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1", "vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2", @@ -860,6 +866,8 @@ global_asm!( "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1", "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2", "vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3", + "vpunpckldq zmm30, zmm14, zmm15", + "vpunpckhdq zmm31, zmm14, zmm15", "vmovdqu32 
xmm16, xmmword ptr [rdi + 2 * 16 + 0 * 1024]", "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1", "vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2", @@ -868,6 +876,8 @@ global_asm!( "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1", "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2", "vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3", + "vpunpckldq zmm8, zmm16, zmm17", + "vpunpckhdq zmm9, zmm16, zmm17", "vmovdqu32 xmm18, xmmword ptr [rdi + 2 * 16 + 2 * 1024]", "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1", "vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2", @@ -876,6 +886,8 @@ global_asm!( "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1", "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2", "vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 15 * 1024], 3", + "vpunpckldq zmm10, zmm18, zmm19", + "vpunpckhdq zmm11, zmm18, zmm19", "vmovdqu32 xmm20, xmmword ptr [rdi + 3 * 16 + 0 * 1024]", "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1", "vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2", @@ -884,6 +896,8 @@ global_asm!( "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1", "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2", "vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3", + "vpunpckldq zmm12, zmm20, zmm21", + "vpunpckhdq zmm13, zmm20, zmm21", "vmovdqu32 xmm22, xmmword ptr [rdi + 3 * 16 + 2 * 1024]", "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1", "vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2", @@ -892,21 +906,6 @@ global_asm!( "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1", "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2", "vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3", - 
// interleave 32 bit words - "vpunpckldq zmm24, zmm8, zmm9", - "vpunpckhdq zmm25, zmm8, zmm9", - "vpunpckldq zmm26, zmm10, zmm11", - "vpunpckhdq zmm27, zmm10, zmm11", - "vpunpckldq zmm28, zmm12, zmm13", - "vpunpckhdq zmm29, zmm12, zmm13", - "vpunpckldq zmm30, zmm14, zmm15", - "vpunpckhdq zmm31, zmm14, zmm15", - "vpunpckldq zmm8, zmm16, zmm17", - "vpunpckhdq zmm9, zmm16, zmm17", - "vpunpckldq zmm10, zmm18, zmm19", - "vpunpckhdq zmm11, zmm18, zmm19", - "vpunpckldq zmm12, zmm20, zmm21", - "vpunpckhdq zmm13, zmm20, zmm21", "vpunpckldq zmm14, zmm22, zmm23", "vpunpckhdq zmm15, zmm22, zmm23", // interleave 64-bit words |
