diff options
| author | Samuel Neves <[email protected]> | 2020-01-19 18:41:55 +0000 |
|---|---|---|
| committer | Samuel Neves <[email protected]> | 2020-01-19 18:45:37 +0000 |
| commit | b8c33e11ef4a85a0d88743cb7f00b66c2c9fc538 (patch) | |
| tree | 66af9a0d64fd6e373ff9e3d7f176c6249af631c5 | |
| parent | a3147eb90921094a3b3a71f0db0308c9fe09a6f1 (diff) | |
manually prefetch message blocks
| -rw-r--r-- | c/blake3_avx2.c | 3 | ||||
| -rw-r--r-- | c/blake3_avx512.c | 9 | ||||
| -rw-r--r-- | c/blake3_sse41.c | 3 | ||||
| -rw-r--r-- | src/avx2.rs | 3 | ||||
| -rw-r--r-- | src/sse41.rs | 3 |
5 files changed, 21 insertions, 0 deletions
diff --git a/c/blake3_avx2.c b/c/blake3_avx2.c index 0300505..a0c2c5a 100644 --- a/c/blake3_avx2.c +++ b/c/blake3_avx2.c @@ -213,6 +213,9 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for(size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } transpose_vecs(&out[0]); transpose_vecs(&out[8]); } diff --git a/c/blake3_avx512.c b/c/blake3_avx512.c index fc754e2..3e07ecb 100644 --- a/c/blake3_avx512.c +++ b/c/blake3_avx512.c @@ -467,6 +467,9 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for(size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } transpose_vecs_128(&out[0]); transpose_vecs_128(&out[4]); transpose_vecs_128(&out[8]); @@ -720,6 +723,9 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for(size_t i = 0; i < 8; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } transpose_vecs_256(&out[0]); transpose_vecs_256(&out[8]); } @@ -1030,6 +1036,9 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, out[13] = loadu_512(&inputs[13][block_offset]); out[14] = loadu_512(&inputs[14][block_offset]); out[15] = loadu_512(&inputs[15][block_offset]); + for(size_t i = 0; i < 16; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } transpose_vecs_512(out); } diff --git a/c/blake3_sse41.c b/c/blake3_sse41.c index 3bf281f..3b6d168 100644 --- a/c/blake3_sse41.c +++ b/c/blake3_sse41.c @@ -428,6 +428,9 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for(size_t i = 0; i < 4; ++i) { + _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0); + } transpose_vecs(&out[0]); transpose_vecs(&out[4]); transpose_vecs(&out[8]); diff --git a/src/avx2.rs b/src/avx2.rs index 841b6e1..7f36072 100644 --- a/src/avx2.rs +++ b/src/avx2.rs @@ -261,6 +261,9 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); diff --git a/src/sse41.rs b/src/sse41.rs index 349c74d..fcf2f98 100644 --- a/src/sse41.rs +++ b/src/sse41.rs @@ -512,6 +512,9 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as * const i8, _MM_HINT_T0); + } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); |
