about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jack O'Connor <[email protected]> 2022-03-08 12:03:03 -0500
committer: Jack O'Connor <[email protected]> 2022-03-08 22:23:09 -0500
commit: 78b8e87f91a1c9253a8de2f088d09f5d0c15d990 (patch)
tree: feb1556f7607be5681503eca5172289da76cf2d1
parent: 87a9318233fe8c5347453afecaa69242e09c929c (diff)
interleave the first pass -- good performance
-rw-r--r-- src/kernel.rs 29
1 file changed, 14 insertions, 15 deletions
diff --git a/src/kernel.rs b/src/kernel.rs
index 8336661..09860e4 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -836,6 +836,8 @@ global_asm!(
"vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 5 * 1024], 1",
"vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 9 * 1024], 2",
"vinserti32x4 zmm9, zmm9, xmmword ptr [rdi + 0 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm24, zmm8, zmm9",
+ "vpunpckhdq zmm25, zmm8, zmm9",
"vmovdqu32 xmm10, xmmword ptr [rdi + 0 * 16 + 2 * 1024]",
"vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 6 * 1024], 1",
"vinserti32x4 zmm10, zmm10, xmmword ptr [rdi + 0 * 16 + 10 * 1024], 2",
@@ -844,6 +846,8 @@ global_asm!(
"vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 7 * 1024], 1",
"vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 11 * 1024], 2",
"vinserti32x4 zmm11, zmm11, xmmword ptr [rdi + 0 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm26, zmm10, zmm11",
+ "vpunpckhdq zmm27, zmm10, zmm11",
"vmovdqu32 xmm12, xmmword ptr [rdi + 1 * 16 + 0 * 1024]",
"vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 4 * 1024], 1",
"vinserti32x4 zmm12, zmm12, xmmword ptr [rdi + 1 * 16 + 8 * 1024], 2",
@@ -852,6 +856,8 @@ global_asm!(
"vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 5 * 1024], 1",
"vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 9 * 1024], 2",
"vinserti32x4 zmm13, zmm13, xmmword ptr [rdi + 1 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm28, zmm12, zmm13",
+ "vpunpckhdq zmm29, zmm12, zmm13",
"vmovdqu32 xmm14, xmmword ptr [rdi + 1 * 16 + 2 * 1024]",
"vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 6 * 1024], 1",
"vinserti32x4 zmm14, zmm14, xmmword ptr [rdi + 1 * 16 + 10 * 1024], 2",
@@ -860,6 +866,8 @@ global_asm!(
"vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 7 * 1024], 1",
"vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 11 * 1024], 2",
"vinserti32x4 zmm15, zmm15, xmmword ptr [rdi + 1 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm30, zmm14, zmm15",
+ "vpunpckhdq zmm31, zmm14, zmm15",
"vmovdqu32 xmm16, xmmword ptr [rdi + 2 * 16 + 0 * 1024]",
"vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 4 * 1024], 1",
"vinserti32x4 zmm16, zmm16, xmmword ptr [rdi + 2 * 16 + 8 * 1024], 2",
@@ -868,6 +876,8 @@ global_asm!(
"vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 5 * 1024], 1",
"vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 9 * 1024], 2",
"vinserti32x4 zmm17, zmm17, xmmword ptr [rdi + 2 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm8, zmm16, zmm17",
+ "vpunpckhdq zmm9, zmm16, zmm17",
"vmovdqu32 xmm18, xmmword ptr [rdi + 2 * 16 + 2 * 1024]",
"vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 6 * 1024], 1",
"vinserti32x4 zmm18, zmm18, xmmword ptr [rdi + 2 * 16 + 10 * 1024], 2",
@@ -876,6 +886,8 @@ global_asm!(
"vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 7 * 1024], 1",
"vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 11 * 1024], 2",
"vinserti32x4 zmm19, zmm19, xmmword ptr [rdi + 2 * 16 + 15 * 1024], 3",
+ "vpunpckldq zmm10, zmm18, zmm19",
+ "vpunpckhdq zmm11, zmm18, zmm19",
"vmovdqu32 xmm20, xmmword ptr [rdi + 3 * 16 + 0 * 1024]",
"vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 4 * 1024], 1",
"vinserti32x4 zmm20, zmm20, xmmword ptr [rdi + 3 * 16 + 8 * 1024], 2",
@@ -884,6 +896,8 @@ global_asm!(
"vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 5 * 1024], 1",
"vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 9 * 1024], 2",
"vinserti32x4 zmm21, zmm21, xmmword ptr [rdi + 3 * 16 + 13 * 1024], 3",
+ "vpunpckldq zmm12, zmm20, zmm21",
+ "vpunpckhdq zmm13, zmm20, zmm21",
"vmovdqu32 xmm22, xmmword ptr [rdi + 3 * 16 + 2 * 1024]",
"vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 6 * 1024], 1",
"vinserti32x4 zmm22, zmm22, xmmword ptr [rdi + 3 * 16 + 10 * 1024], 2",
@@ -892,21 +906,6 @@ global_asm!(
"vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 7 * 1024], 1",
"vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 11 * 1024], 2",
"vinserti32x4 zmm23, zmm23, xmmword ptr [rdi + 3 * 16 + 15 * 1024], 3",
- // interleave 32 bit words
- "vpunpckldq zmm24, zmm8, zmm9",
- "vpunpckhdq zmm25, zmm8, zmm9",
- "vpunpckldq zmm26, zmm10, zmm11",
- "vpunpckhdq zmm27, zmm10, zmm11",
- "vpunpckldq zmm28, zmm12, zmm13",
- "vpunpckhdq zmm29, zmm12, zmm13",
- "vpunpckldq zmm30, zmm14, zmm15",
- "vpunpckhdq zmm31, zmm14, zmm15",
- "vpunpckldq zmm8, zmm16, zmm17",
- "vpunpckhdq zmm9, zmm16, zmm17",
- "vpunpckldq zmm10, zmm18, zmm19",
- "vpunpckhdq zmm11, zmm18, zmm19",
- "vpunpckldq zmm12, zmm20, zmm21",
- "vpunpckhdq zmm13, zmm20, zmm21",
"vpunpckldq zmm14, zmm22, zmm23",
"vpunpckhdq zmm15, zmm22, zmm23",
// interleave 64-bit words