diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-23 21:12:38 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:55:06 -0400 |
| commit | 40a4a2b6b016e999d6dae2b0eb67f08aad6150bf (patch) | |
| tree | 934a3e6038248af3a73792e4d71b72f6958ab25e | |
| parent | d91f20dd29e491b70d0fb900ff3445f53add50a3 (diff) | |
SSE2 intrinsic: emulate _mm_blend_epi16 SSE4.1 intrinsic with SSE2 intrinsics
Use a constant mask to blend according to (mask & b) | ((~mask) & a).
* src/rust_sse2.rs: emulate _mm_blend_epi16 using SSE2 intrinsics
* c/blake3_sse2.c: Likewise.
| -rw-r--r-- | c/blake3_sse2.c | 31 |
| -rw-r--r-- | src/rust_sse2.rs | 32 |
2 files changed, 39 insertions(+), 24 deletions(-)
diff --git a/c/blake3_sse2.c b/c/blake3_sse2.c index a6a1a33..35936dc 100644 --- a/c/blake3_sse2.c +++ b/c/blake3_sse2.c @@ -80,6 +80,13 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); } +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) { + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_mullo_epi16(mask, set4(0x40008000, 0x10002000, 0x04000800, 0x01000200)); + mask = _mm_srai_epi16(mask, 15); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { @@ -122,11 +129,11 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -145,11 +152,11 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = 
_mm_unpackhi_epi32(m1, m3); @@ -168,11 +175,11 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -191,11 +198,11 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -214,11 +221,11 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -237,11 +244,11 @@ INLINE void 
compress_pre(__m128i rows[4], const uint32_t cv[8], g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); diagonalize(&rows[0], &rows[2], &rows[3]); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); t3 = _mm_unpackhi_epi32(m1, m3); diff --git a/src/rust_sse2.rs b/src/rust_sse2.rs index 3313fc7..3084ed1 100644 --- a/src/rust_sse2.rs +++ b/src/rust_sse2.rs @@ -138,6 +138,14 @@ unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m12 } #[inline(always)] +unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let mut mask = _mm_set1_epi16(imm8 as i16); + mask = _mm_mullo_epi16(mask, set4(0x40008000, 0x10002000, 0x04000800, 0x01000200)); + mask = _mm_srai_epi16(mask, 15); + _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) +} + +#[inline(always)] unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], @@ -192,11 +200,11 @@ unsafe fn compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -215,11 +223,11 @@ unsafe fn compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, 
t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -238,11 +246,11 @@ unsafe fn compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -261,11 +269,11 @@ unsafe fn compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -284,11 +292,11 @@ unsafe fn compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); @@ -307,11 +315,11 @@ unsafe fn 
compress_pre( g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); + t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); + tt = blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); |
