about | summary | refs | log | tree | commit | diff
path: root/c/blake3_avx512.c
Age | Commit message (Collapse) | Author
2024-08-15 | add an intrinsics implementation of blake3_xof_many_avx512 | Jack O'Connor
2023-05-23 | Fix typos | Joel Rosdahl
2022-11-23 | fix incorrect output from AVX-512 intrinsics in debug mode under GCC 5.4 and 6.1 | Jack O'Connor
Fixes https://github.com/BLAKE3-team/BLAKE3/issues/271. The `_mm512_cmp_epu32_mask` intrinsic is broken under GCC 5.4 and 6.1. This led to incorrect output in the AVX-512 implementation when building with intrinsics instead of assembly. This fix is a simplified version of Samuel's proposed fix here: https://github.com/BLAKE3-team/BLAKE3/commit/f10816e857bfd7d695635c6ee8f21b7649bb4e8f#commitcomment-90742995
2022-01-08 | fix some compiler warnings | Samuel Neves
2020-02-13 | Work around GCC bug 85328 by forcing trivially masked stores. | Samuel Neves
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85328 Fixes #58.
2020-01-30 | [MSVC] added possible to compile at Microsoft Visual C compiler. | TheVice
[main.c] removed including of unistd.h from c/main.c file. [blake3_avx2.c|blake3_avx512.c|blake3_sse41.c] resolved compile error: 'C4146' - applying of unary minus operator to the unsigned value.
2020-01-23 | streamline load_counters | Samuel Neves
avx2 before:
    mov eax, esi
    neg rax
    vmovq xmm0, rax
    vpbroadcastq ymm0, xmm0
    vpand ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
    vmovq xmm2, rdi
    vpbroadcastq ymm1, xmm2
    vpaddq ymm1, ymm0, ymm1
    vmovdqa ymm0, ymmword ptr [rip + .LCPI1_1] # ymm0 = [0,2,4,6,4,6,6,7]
    vpermd ymm3, ymm0, ymm1
    mov r8d, eax
    and r8d, 5
    add r8, rdi
    mov esi, eax
    and esi, 6
    add rsi, rdi
    and eax, 7
    vpshufd xmm4, xmm3, 231 # xmm4 = xmm3[3,1,2,3]
    vpinsrd xmm4, xmm4, r8d, 1
    add rax, rdi
    vpinsrd xmm4, xmm4, esi, 2
    vpinsrd xmm4, xmm4, eax, 3
    vpshufd xmm3, xmm3, 144 # xmm3 = xmm3[0,0,1,2]
    vpinsrd xmm3, xmm3, edi, 0
    vmovdqa xmmword ptr [rdx], xmm3
    vmovdqa xmmword ptr [rdx + 16], xmm4
    vpermq ymm3, ymm1, 144 # ymm3 = ymm1[0,0,1,2]
    vpblendd ymm2, ymm3, ymm2, 3 # ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
    vpsrlq ymm2, ymm2, 32
    vpermd ymm2, ymm0, ymm2
    vextracti128 xmm1, ymm1, 1
    vmovq xmm3, rax
    vmovq xmm4, rsi
    vpunpcklqdq xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0]
    vmovq xmm4, r8
    vpalignr xmm1, xmm4, xmm1, 8 # xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
    vinserti128 ymm1, ymm1, xmm3, 1
    vpsrlq ymm1, ymm1, 32
    vpermd ymm0, ymm0, ymm1
avx2 after:
    neg esi
    vmovd xmm0, esi
    vpbroadcastd ymm0, xmm0
    vmovd xmm1, edi
    vpbroadcastd ymm1, xmm1
    vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
    vpaddd ymm1, ymm1, ymm0
    vpbroadcastd ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
    vpor ymm0, ymm0, ymm2
    vpxor ymm2, ymm1, ymm2
    vpcmpgtd ymm0, ymm0, ymm2
    shr rdi, 32
    vmovd xmm2, edi
    vpbroadcastd ymm2, xmm2
    vpsubd ymm0, ymm2, ymm0
2020-01-22 | clang-format | Jack O'Connor
2020-01-19 | manually prefetch message blocks | Samuel Neves
2020-01-09 | merge BLAKE3-c into this repo | Jack O'Connor
This is commit 4476d9da0e370993823e7ad17592b84e905afd76 of https://github.com/veorq/BLAKE3-c.