aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jack O'Connor <[email protected]> 2023-07-05 10:28:07 -0700
committer Jack O'Connor <[email protected]> 2023-07-05 10:29:02 -0700
commit f7e1a7429ff7727144a67a06ac210d2831f392a2 (patch)
tree 87fd6a9afa96ed75369f7d8061719d850d975c79
parent 7038dad280eb5dcd622bf54336207b683388e8cc (diff)
retain the old NEON rotations in inline comments
-rw-r--r-- c/blake3_neon.c 10
1 files changed, 10 insertions, 0 deletions
diff --git a/c/blake3_neon.c b/c/blake3_neon.c
index 1d4559e..8a818fc 100644
--- a/c/blake3_neon.c
+++ b/c/blake3_neon.c
@@ -36,14 +36,22 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
+ // Rotate each 32-bit lane by 16 by swapping its 16-bit halves. The plain form
+ // (two shifts and an OR, kept below) was slower on the microarchitectures
+ // tested; see https://github.com/BLAKE3-team/BLAKE3/pull/319.
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
+ // Rotate right by 12. Old shift/OR form below; rationale is in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
+ // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
#if defined(__clang__)
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
@@ -55,6 +63,8 @@ INLINE uint32x4_t rot8_128(uint32x4_t x) {
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
+ // Rotate right by 7. Old shift/OR form below; rationale is in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
}