diff options
| author | Jack O'Connor <[email protected]> | 2023-07-05 10:28:07 -0700 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2023-07-05 10:29:02 -0700 |
| commit | f7e1a7429ff7727144a67a06ac210d2831f392a2 (patch) | |
| tree | 87fd6a9afa96ed75369f7d8061719d850d975c79 | |
| parent | 7038dad280eb5dcd622bf54336207b683388e8cc (diff) | |
retain the old NEON rotations in inline comments
| -rw-r--r-- | c/blake3_neon.c | 10 |
1 file changed, 10 insertions, 0 deletions
diff --git a/c/blake3_neon.c b/c/blake3_neon.c index 1d4559e..8a818fc 100644 --- a/c/blake3_neon.c +++ b/c/blake3_neon.c @@ -36,14 +36,22 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { } INLINE uint32x4_t rot16_128(uint32x4_t x) { + // The straightforward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); } INLINE uint32x4_t rot12_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); } INLINE uint32x4_t rot8_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); #if defined(__clang__) return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 @@ -55,6 +63,8 @@ INLINE uint32x4_t rot8_128(uint32x4_t x) { } INLINE uint32x4_t rot7_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); } |
