diff options
| author | silvanshade <[email protected]> | 2025-02-08 05:28:20 -0700 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2025-03-09 16:00:11 -0700 |
| commit | d9b49df0757f8bdfaa542e7181013fbf1555ff89 (patch) | |
| tree | b0b3323bd07e22d39fb032e53b019386be9fb640 | |
| parent | a31e519869d5751370f50c39a99340660ee95bf7 (diff) | |
Implement TBB-based parallelism for C lib
| -rw-r--r-- | .github/workflows/ci.yml | 141 | ||||
| -rw-r--r-- | README.md | 5 | ||||
| -rw-r--r-- | c/CMakeLists.txt | 84 | ||||
| -rw-r--r-- | c/README.md | 126 | ||||
| -rw-r--r-- | c/blake3.c | 63 | ||||
| -rw-r--r-- | c/blake3.h | 4 | ||||
| -rw-r--r-- | c/blake3_impl.h | 29 | ||||
| -rw-r--r-- | c/blake3_tbb.cpp | 37 | ||||
| -rw-r--r-- | c/cmake/BLAKE3/ContinuousIntegration.cmake | 235 | ||||
| -rw-r--r-- | c/cmake/BLAKE3/Examples.cmake | 6 | ||||
| -rw-r--r-- | c/cmake/BLAKE3/Testing.cmake | 3 | ||||
| -rw-r--r-- | c/dependencies/CMakeLists.txt | 3 | ||||
| -rw-r--r-- | c/dependencies/tbb/CMakeLists.txt | 33 |
13 files changed, 667 insertions, 102 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ce575e..4bb3aaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -225,54 +225,48 @@ jobs: # Currently only on x86. c_tests: - name: C Makefile tests + name: C tests SIMD=${{ matrix.simd }} TBB=${{ matrix.use_tbb }} runs-on: ubuntu-latest - + strategy: + fail-fast: false + matrix: + use_tbb: ["OFF", "ON"] + simd: ["x86-intrinsics", "amd64-asm"] steps: - uses: actions/checkout@v4 - # Test the intrinsics-based implementations. - - run: make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse2.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse41.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx2.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx512.c - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test - working-directory: ./c - # Test the assembly implementations. 
- - run: make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm - working-directory: ./c - - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S - working-directory: ./c - - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm - working-directory: ./c - # Restore the files we deleted above. - - run: git checkout . - # Build the example. - - run: make -f Makefile.testing example - working-directory: ./c + - run: | + sudo apt-get update + sudo apt-get install ninja-build libtbb-dev libtbb12 + # Test the intrinsics-based and assembly-based implementations. 
+ - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 -DBLAKE3_NO_SSE41=1 + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" "-DBLAKE3_NO_AVX512=1" + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + # Test with TBB disabled/enabled. + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" + cmake --build c/build --target test + cat c/build/Testing/Temporary/LastTest.log + # Build the example with TBB disabled/enabled. + - run: | + cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON -DBLAKE3_EXAMPLES=ON + cmake --build c/build --target blake3-example # Note that this jobs builds AArch64 binaries from an x86_64 host. 
build_apple_silicon: @@ -323,28 +317,59 @@ jobs: cd /work ~/.cargo/bin/cargo test --features prefer_intrinsics - # CMake build test (Library only), current macOS/Linux only. + # CMake build test (Library only). cmake_current_build: - name: CMake ${{ matrix.os }} ${{ matrix.compiler }} + name: CMake ${{ matrix.os }} CC=${{ matrix.toolchain.cc }} CXX=${{ matrix.toolchain.cxx }} TBB=${{ matrix.use_tbb }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest", "macOS-latest", "windows-latest"] - compiler: [gcc, clang, cl] + cmakeVersion: [latest] + ninjaVersion: [latest] + os: [ubuntu-latest, macOS-latest, windows-latest] + toolchain: [ + { cc: cl, cxx: cl }, + { cc: clang, cxx: clang++ }, + { cc: clang-cl, cxx: clang-cl }, + { cc: gcc, cxx: g++ }, + ] + use_tbb: [OFF, ON] exclude: - - os: windows-latest - compiler: gcc - - os: ubuntu-latest - compiler: msvc - os: macOS-latest - compiler: msvc + toolchain: { cc: cl, cxx: cl } + - os: macOS-latest + toolchain: { cc: clang-cl, cxx: clang-cl } + - os: ubuntu-latest + toolchain: { cc: cl, cxx: cl } + - os: ubuntu-latest + toolchain: { cc: clang-cl, cxx: clang-cl } + - os: windows-latest + toolchain: { cc: clang, cxx: clang++ } + use_tbb: ON + - os: windows-latest + toolchain: { cc: gcc, cxx: g++ } + use_tbb: ON steps: - uses: actions/checkout@v4 - - name: CMake generation - run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target - - name: CMake build / install - run: cmake --build c/build --target install + - uses: lukka/get-cmake@5f6e04f5267c8133f1273bf2103583fc72c46b17 + with: + cmakeVersion: ${{ matrix.cmakeVersion }} + ninjaVersion: ${{ matrix.ninjaVersion }} + - if: matrix.os == 'macOS-latest' + name: Install dependencies on macOS + run: | + brew update + brew install tbb + - if: matrix.os == 'ubuntu-latest' + name: Install dependencies on Linux + run: | + sudo apt-get update + sudo apt-get install libtbb-dev libtbb12 + - name: CMake generation, build, install + run: 
| + ${{ matrix.os != 'windows-latest' || '& "C:/Program Files/Microsoft Visual Studio/2022/Enterprise/Common7/Tools/Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation' }} + cmake -S c -B c/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/target -DCMAKE_C_COMPILER=${{ matrix.toolchain.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.toolchain.cxx }} -DBLAKE3_USE_TBB=${{ matrix.use_tbb }} -DBLAKE3_FETCH_TBB=${{ matrix.os == 'windows-latest' && 'YES' || 'NO' }} -DBLAKE3_EXAMPLES=ON + cmake --build c/build --target install cmake_3-9_build: name: CMake 3.9.6 ubuntu-latest runs-on: ubuntu-latest @@ -44,9 +44,8 @@ This repository is the official implementation of BLAKE3. It includes: typical desktop hardware. * The [C implementation](c), which like the Rust implementation includes - SIMD code and runtime CPU feature detection on x86. Unlike the Rust - implementation, it's [not currently multithreaded](c#multithreading). See - [`c/README.md`](c/README.md). + SIMD code and runtime CPU feature detection on x86. The `BLAKE3_USE_TBB` + CMake option enables multithreading. See [`c/README.md`](c/README.md). 
* The [Rust reference implementation](reference_impl/reference_impl.rs), which is discussed in Section 5.1 of the [BLAKE3 diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 76c8947..ba88ed3 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -12,12 +12,20 @@ endif() project(libblake3 VERSION 1.6.1 DESCRIPTION "BLAKE3 C implementation" - LANGUAGES C ASM + LANGUAGES C CXX ASM ) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +option(BLAKE3_USE_TBB "Enable oneTBB parallelism" OFF) +option(BLAKE3_FETCH_TBB "Allow fetching oneTBB from GitHub if not found on system" OFF) + +include(CTest) include(FeatureSummary) include(GNUInstallDirs) +add_subdirectory(dependencies) + # architecture lists for which to enable assembly / SIMD sources set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) set(BLAKE3_X86_NAMES i686 x86 X86) @@ -138,6 +146,11 @@ set_target_properties(blake3 PROPERTIES C_EXTENSIONS OFF ) target_compile_features(blake3 PUBLIC c_std_99) +if(BLAKE3_USE_TBB AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) + target_compile_features(blake3 PUBLIC cxx_std_20) + # CMake < 3.12: the C++20 flag is added further below through `BLAKE3_CXXFLAGS_*` +endif() + # ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD # which may be set by the user or toolchain file if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD) @@ -204,6 +217,67 @@ else() message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'") endif() +if(BLAKE3_USE_TBB) + find_package(TBB 2021.11.0 QUIET) + if(NOT TBB_FOUND AND NOT TARGET TBB::tbb) + message(WARNING + "oneTBB not found; disabling BLAKE3_USE_TBB\n" + "Enable BLAKE3_FETCH_TBB to automatically fetch and build oneTBB" + ) + set(BLAKE3_USE_TBB OFF) + else() + target_sources(blake3 + PRIVATE + blake3_tbb.cpp) + target_link_libraries(blake3 + PUBLIC + # Make shared TBB a transitive dependency. 
The consuming program is technically not required + # to link TBB in order for libblake3 to function but we do this in order to prevent the + # possibility of multiple separate TBB runtimes being linked into a final program in case + # the consuming program also happens to already use TBB. + TBB::tbb) + target_compile_definitions(blake3 + PUBLIC + BLAKE3_USE_TBB) + endif() +endif() + +if(BLAKE3_USE_TBB) + # Define some scratch variables for building appropriate flags per compiler + if(CMAKE_VERSION VERSION_LESS 3.12) + list(APPEND BLAKE3_CXX_STANDARD_FLAGS_GNU -std=c++20) + list(APPEND BLAKE3_CXX_STANDARD_FLAGS_MSVC /std:c++20) + endif() + set(BLAKE3_CXXFLAGS_GNU "-fno-exceptions;-fno-rtti;${BLAKE3_CXX_STANDARD_FLAGS_GNU}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with GNU-like compiler frontends.") + set(BLAKE3_CXXFLAGS_MSVC "/EHs-c-;/GR-;${BLAKE3_CXX_STANDARD_FLAGS_MSVC}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with MSVC-like compiler frontends.") + # Get the C++ compiler name without extension + get_filename_component(BLAKE3_CMAKE_CXX_COMPILER_NAME "${CMAKE_CXX_COMPILER}" NAME_WE) + # Strip any trailing versioning from the C++ compiler name + string(REGEX MATCH "^(clang\\+\\+|clang-cl)" BLAKE3_CMAKE_CXX_COMPILER_NAME "${BLAKE3_CMAKE_CXX_COMPILER_NAME}") + + # TODO: Simplify with CMAKE_CXX_COMPILER_FRONTEND_VARIANT once min CMake version is 3.14. 
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang++") + target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>) + elseif(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang-cl") + target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>) + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>) + endif() + + # Undefine scratch variables + unset(BLAKE3_CXX_STANDARD_FLAGS_GNU) + unset(BLAKE3_CXX_STANDARD_FLAGS_MSVC) + unset(BLAKE3_CMAKE_CXX_COMPILER_NAME) + unset(BLAKE3_CXXFLAGS_GNU) + unset(BLAKE3_CXXFLAGS_MSVC) +endif() + # cmake install support install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") install(TARGETS blake3 EXPORT blake3-targets @@ -267,4 +341,12 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc" add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") +add_feature_info("oneTBB parallelism" BLAKE3_USE_TBB "The library uses oneTBB parallelism.") feature_summary(WHAT ENABLED_FEATURES) + +if(BLAKE3_EXAMPLES) + include(BLAKE3/Examples) +endif() +if(BLAKE3_TESTING) + include(BLAKE3/Testing) +endif() diff --git a/c/README.md b/c/README.md index 965d8c7..c012233 100644 --- a/c/README.md +++ b/c/README.md @@ -84,6 +84,8 @@ Initialize a `blake3_hasher` in the default hashing mode. 
--- +### Single-threaded update + ```c void blake3_hasher_update( blake3_hasher *self, @@ -91,7 +93,39 @@ void blake3_hasher_update( size_t input_len); ``` -Add input to the hasher. This can be called any number of times. +Add input to the hasher with single-threaded update. This can be called any number of times. + +### Multi-threaded update + +```c +void blake3_hasher_update_tbb( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +NOTE: This function is only enabled when the library is compiled with CMake option `BLAKE3_USE_TBB` +and when the oneTBB library is detected on the host system. See the building instructions for +further details. + +NOTE: macOS and Linux users should probably install TBB through their package manager but Windows +users, or users with special deployment scenarios, may wish to enable `BLAKE3_FETCH_TBB` in +order to automatically fetch, build, and install TBB directly alongside BLAKE3. + +Add input to the hasher with multi-threaded update. This can be called any number of times. + +This update function uses [oneTBB](https://uxlfoundation.github.io/oneTBB/) task groups +across which the input data is partitioned and dispatched for further processing. + +Input buffers as large as possible should be preferred in order to minimize additional +overhead inherent in coordinating parallel tasks. If the input buffer is too small, +the performance of this update function may be no faster than the single-threaded +implementation or may even be slower. + +This implementation does not require configuration of thread resources and will use as +many cores as possible by default. If the update function is used within the context of a +larger program which uses the [oneTBB] API, more fine-grained control of resources is +possible. --- @@ -184,10 +218,7 @@ void blake3_hasher_reset( Reset the hasher to its initial state, prior to any calls to `blake3_hasher_update`. 
Currently this is no different from calling -`blake3_hasher_init` or similar again. However, if this implementation gains -multithreading support in the future, and if `blake3_hasher` holds (optional) -threading resources, this function will reuse those resources. Until then, this -is mainly for feature compatibility with the Rust implementation. +`blake3_hasher_init` or similar again. # Security Notes @@ -207,12 +238,57 @@ smell](https://en.wikipedia.org/wiki/Design_smell) in any case. # Building -This implementation is just C and assembly files. It doesn't include a -public-facing build system. (The `Makefile` in this directory is only -for testing.) Instead, the intention is that you can include these files -in whatever build system you're already using. This section describes -the commands your build system should execute, or which you can execute -by hand. Note that these steps may change in future versions. +The easiest and most complete method of compiling the BLAKE3 library is with CMake. + +This is the method described in the next section. + +Toward the end of the building section there are more in depth notes about compiling manually and +things that are useful to understand if you need to adapt the implementation to some pre-existing +custom build system. + +## CMake + +The BLAKE3 library requires a minimum version of CMake 3.9. + +The following invocation will compile and install `libblake3`: + +With recent CMake: + +```bash +cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" +cmake --build c/build --target install +``` + +With an older CMake: + +```bash +cd c +mkdir build +cd build +cmake .. "-DCMAKE_INSTALL_PREFIX=/usr/local" +cmake --build . 
--target install ``` + +The following options are available when compiling with CMake: + +- `BLAKE3_USE_TBB`: Enable oneTBB parallelism (Requires a C++20 capable compiler) +- `BLAKE3_FETCH_TBB`: Allow fetching oneTBB from GitHub (only if not found on system) +- `BLAKE3_EXAMPLES`: Compile and install example programs + +These can be enabled in the following way: + +```bash +cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" -DBLAKE3_USE_TBB=1 -DBLAKE3_FETCH_TBB=1 +``` + +## Building manually + +This implementation is mostly C and assembly files with some minor parts in C++ for optional +features. + +The intention is that the implementation is simple enough that it can be easily compiled by hand +without a build system or the sources adapted to whatever custom build system you may happen to be +using without much difficulty. ## x86 @@ -300,6 +376,23 @@ in call to always_inline ‘vaddq_u32’: target specific option mismatch ...then you may need to add something like `-mfpu=neon-vfpv4 -mfloat-abi=hard`. +## oneTBB-based multi-threading + +Optional multi-threading support with performance similar to the Rust Rayon implementation is +available when using the oneTBB library and compiling the optional C++ support file: + +```bash +g++ -c -O3 -fno-exceptions -fno-rtti -DBLAKE3_USE_TBB $(pkg-config --libs --cflags tbb) -o blake3_tbb.o blake3_tbb.cpp +gcc -O3 -o example -lstdc++ -DBLAKE3_USE_TBB $(pkg-config --libs --cflags tbb) blake3_tbb.o \ + example.c blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S +``` + +NOTE: Compiling `blake3_tbb.cpp` with C++ exceptions _disabled_ is required in order to satisfy the +behavior that this implementation expects. The public API methods with external C linkage are marked +`noexcept`. Attempting to compile this file with exceptions _enabled_ will fail and emit a static +assertion message. 
Compiling with RTTI disabled is not mandatory but recommended for code size. + ## Other Platforms The portable implementation should work on most other architectures. For @@ -308,14 +401,3 @@ example: ```bash gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c ``` - -# Multithreading - -Unlike the Rust implementation, the C implementation doesn't currently support -multithreading. A future version of this library could add support by taking an -optional dependency on OpenMP or similar. Alternatively, we could expose a -lower-level API to allow callers to implement concurrency themselves. The -former would be more convenient and less error-prone, but the latter would give -callers the maximum possible amount of control. The best choice here depends on -the specific use case, so if you have a use case for multithreaded hashing in -C, please file a GitHub issue and let us know. @@ -265,11 +265,10 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multi-threading parallelism for that update(). -static size_t blake3_compress_subtree_wide(const uint8_t *input, - size_t input_len, - const uint32_t key[8], - uint64_t chunk_counter, - uint8_t flags, uint8_t *out) { +size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb) { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. If this implementation adds multi-threading in the future, // this gives us the option of multi-threading even the 2-chunk case, which @@ -303,12 +302,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, } uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; - // Recurse! 
If this implementation adds multi-threading support in the - // future, this is where it will go. - size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, - chunk_counter, flags, cv_array); - size_t right_n = blake3_compress_subtree_wide( - right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + // Recurse! + size_t left_n = -1; + size_t right_n = -1; + +#if defined(BLAKE3_USE_TBB) + blake3_compress_subtree_wide_join_tbb( + key, flags, use_tbb, + // left-hand side + input, left_input_len, chunk_counter, cv_array, &left_n, + // right-hand side + right_input, right_input_len, right_chunk_counter, right_cvs, &right_n); +#else + left_n = blake3_compress_subtree_wide( + input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb); + right_n = blake3_compress_subtree_wide(right_input, right_input_len, key, + right_chunk_counter, flags, right_cvs, + use_tbb); +#endif // BLAKE3_USE_TBB // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return @@ -334,16 +345,18 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input, // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. 
-INLINE void compress_subtree_to_parent_node( - const uint8_t *input, size_t input_len, const uint32_t key[8], - uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +INLINE void +compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, + uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN], + bool use_tbb) { #if defined(BLAKE3_TESTING) assert(input_len > BLAKE3_CHUNK_LEN); #endif uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, - chunk_counter, flags, cv_array); + chunk_counter, flags, cv_array, use_tbb); assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because // as we just asserted, num_cvs will always be <=2 in that case. But GCC @@ -459,8 +472,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], self->cv_stack_len += 1; } -void blake3_hasher_update(blake3_hasher *self, const void *input, - size_t input_len) { +INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input, + size_t input_len, bool use_tbb) { // Explicitly checking for zero avoids causing UB by passing a null pointer // to memcpy. 
This comes up in practice with things like: // std::vector<uint8_t> v; @@ -546,7 +559,7 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, self->chunk.chunk_counter, - self->chunk.flags, cv_pair); + self->chunk.flags, cv_pair, use_tbb); hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); @@ -568,6 +581,20 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, } } +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = false; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} + +#if defined(BLAKE3_USE_TBB) +void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = true; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} +#endif // BLAKE3_USE_TBB + void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len) { blake3_hasher_finalize_seek(self, 0, out, out_len); @@ -69,6 +69,10 @@ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const voi size_t context_len); BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); +#if defined(BLAKE3_USE_TBB) +BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len); +#endif // BLAKE3_USE_TBB BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, diff --git a/c/blake3_impl.h b/c/blake3_impl.h index 51d792a..facd599 100644 --- a/c/blake3_impl.h +++ b/c/blake3_impl.h @@ -9,6 +9,10 @@ #include "blake3.h" +#ifdef __cplusplus +extern "C" { +#endif + // internal flags enum blake3_flags { CHUNK_START = 1 << 0, @@ -28,6 
+32,12 @@ enum blake3_flags { #define INLINE static inline __attribute__((always_inline)) #endif +#ifdef __cplusplus +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + #if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC) #define IS_X86 #define IS_X86_64 @@ -210,6 +220,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blake3_simd_degree(void); +BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb); + +#if defined(BLAKE3_USE_TBB) +BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb( + // shared params + const uint32_t key[8], uint8_t flags, bool use_tbb, + // left-hand side params + const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, + uint8_t *l_cvs, size_t *l_n, + // right-hand side params + const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, + uint8_t *r_cvs, size_t *r_n) NOEXCEPT; +#endif // Declarations for implementation-specific functions. 
void blake3_compress_in_place_portable(uint32_t cv[8], @@ -300,5 +326,8 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, uint8_t flags_end, uint8_t *out); #endif +#ifdef __cplusplus +} +#endif #endif /* BLAKE3_IMPL_H */ diff --git a/c/blake3_tbb.cpp b/c/blake3_tbb.cpp new file mode 100644 index 0000000..c2bc2db --- /dev/null +++ b/c/blake3_tbb.cpp @@ -0,0 +1,37 @@ +#include <cstddef> +#include <cstdint> + +#include <oneapi/tbb/parallel_invoke.h> + +#include "blake3_impl.h" + +static_assert(TBB_USE_EXCEPTIONS == 0, + "This file should be compiled with C++ exceptions disabled."); + +extern "C" void blake3_compress_subtree_wide_join_tbb( + // shared params + const uint32_t key[8], uint8_t flags, bool use_tbb, + // left-hand side params + const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter, + uint8_t *l_cvs, size_t *l_n, + // right-hand side params + const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter, + uint8_t *r_cvs, size_t *r_n) noexcept { + if (!use_tbb) { + *l_n = blake3_compress_subtree_wide(l_input, l_input_len, key, + l_chunk_counter, flags, l_cvs, use_tbb); + *r_n = blake3_compress_subtree_wide(r_input, r_input_len, key, + r_chunk_counter, flags, r_cvs, use_tbb); + return; + } + + oneapi::tbb::parallel_invoke( + [=]() { + *l_n = blake3_compress_subtree_wide( + l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb); + }, + [=]() { + *r_n = blake3_compress_subtree_wide( + r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb); + }); +} diff --git a/c/cmake/BLAKE3/ContinuousIntegration.cmake b/c/cmake/BLAKE3/ContinuousIntegration.cmake new file mode 100644 index 0000000..57a1f18 --- /dev/null +++ b/c/cmake/BLAKE3/ContinuousIntegration.cmake @@ -0,0 +1,235 @@ +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + +if(BUILD_SHARED_LIBS) + message(FATAL_ERROR "BUILD_SHARED_LIBS is incompatible with BLAKE3_TESTING_CI") +endif() + +include(CTest) + +# Declare a testing specific 
variant of the `blake3` library target. +# +# We use a separate library target in order to be able to perform compilation with various +# combinations of features which are too noisy to specify in the main CMake config as options for +# the normal `blake3` target. +# +# Initially this target has no properties but eventually we will populate them by copying all of the +# relevant properties from the normal `blake3` target. +add_library(blake3-testing + blake3.c + blake3_dispatch.c + blake3_portable.c +) + +if(BLAKE3_USE_TBB AND TBB_FOUND) + target_sources(blake3-testing + PRIVATE + blake3_tbb.cpp) +endif() + +if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm") + # Conditionally add amd64 asm files to `blake3-testing` sources + if(MSVC) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_SSE2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_msvc.asm) + endif() + if(NOT BLAKE3_NO_SSE41) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_msvc.asm) + endif() + elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" + OR CMAKE_C_COMPILER_ID STREQUAL "Clang" + OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") + if (WIN32) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_SSE2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_gnu.S) + endif() + if(NOT BLAKE3_NO_SSE41) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_gnu.S) + endif() + elseif(UNIX) + if(NOT BLAKE3_NO_AVX2) + list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S) + endif() + if(NOT BLAKE3_NO_AVX512) + list(APPEND 
BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_unix.S)
+      endif()
+      if(NOT BLAKE3_NO_SSE2)
+        list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_unix.S)
+      endif()
+      if(NOT BLAKE3_NO_SSE41)
+        list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_unix.S)
+      endif()
+    endif()
+  endif()
+  target_sources(blake3-testing PRIVATE ${BLAKE3_TESTING_AMD64_ASM_SOURCES}) # the list filtered by BLAKE3_NO_* above
+elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
+  # Conditionally add amd64 C files to `blake3-testing` sources; each file gets its own BLAKE3_CFLAGS_* as COMPILE_FLAGS
+  if (NOT DEFINED BLAKE3_CFLAGS_SSE2
+      OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
+      OR NOT DEFINED BLAKE3_CFLAGS_AVX2
+      OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
+    message(WARNING "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
+  else()
+    set(BLAKE3_SIMD_X86_INTRINSICS ON)
+  endif()
+
+  if(NOT BLAKE3_NO_AVX2)
+    target_sources(blake3-testing PRIVATE blake3_avx2.c)
+    set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
+  endif()
+  if(NOT BLAKE3_NO_AVX512)
+    target_sources(blake3-testing PRIVATE blake3_avx512.c)
+    set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
+  endif()
+  if(NOT BLAKE3_NO_SSE2)
+    target_sources(blake3-testing PRIVATE blake3_sse2.c)
+    set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
+  endif()
+  if(NOT BLAKE3_NO_SSE41)
+    target_sources(blake3-testing PRIVATE blake3_sse41.c)
+    set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
+  endif()
+
+elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
+  # Add the neon C file to `blake3-testing` sources and define BLAKE3_USE_NEON=1 for the target
+
+  target_sources(blake3-testing PRIVATE
+    blake3_neon.c
+  )
+  target_compile_definitions(blake3-testing PRIVATE
+    BLAKE3_USE_NEON=1
+  )
+
+  if (DEFINED BLAKE3_CFLAGS_NEON)
+    set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
+  endif()
+
+elseif(BLAKE3_SIMD_TYPE STREQUAL "none") + # Disable neon if simd type is "none". We check for individual amd64 features further below. + + target_compile_definitions(blake3-testing PRIVATE + BLAKE3_USE_NEON=0 + ) + +endif() + +if(BLAKE3_NO_AVX2) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX2) +endif() +if(BLAKE3_NO_AVX512) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX512) +endif() +if(BLAKE3_NO_SSE2) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE2) +endif() +if(BLAKE3_NO_SSE41) + target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE41) +endif() + +target_compile_definitions(blake3-testing PUBLIC BLAKE3_TESTING) + +get_target_property(BLAKE3_COMPILE_DEFINITIONS blake3 COMPILE_DEFINITIONS) +if(BLAKE3_COMPILE_DEFINITIONS) + target_compile_definitions(blake3-testing PUBLIC + ${BLAKE3_COMPILE_DEFINITIONS}) +endif() + +get_target_property(BLAKE3_COMPILE_OPTIONS blake3 COMPILE_OPTIONS) +if(BLAKE3_COMPILE_OPTIONS) + target_compile_options(blake3-testing PRIVATE + ${BLAKE3_COMPILE_OPTIONS} + -O3 + -Wall + -Wextra + -pedantic + -fstack-protector-strong + -D_FORTIFY_SOURCE=2 + -fPIE + -fvisibility=hidden + -fsanitize=address,undefined + ) +endif() + +get_target_property(BLAKE3_INCLUDE_DIRECTORIES blake3 INCLUDE_DIRECTORIES) +if(BLAKE3_INCLUDE_DIRECTORIES) + target_include_directories(blake3-testing PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> + ) +endif() + +get_target_property(BLAKE3_LINK_LIBRARIES blake3 LINK_LIBRARIES) +if(BLAKE3_LINK_LIBRARIES) + target_link_libraries(blake3-testing PRIVATE ${BLAKE3_LINK_LIBRARIES}) +endif() + +get_target_property(BLAKE3_LINK_OPTIONS blake3 LINK_OPTIONS) +if(BLAKE3_LINK_OPTIONS) + target_link_options(blake3-testing PRIVATE + ${BLAKE3_LINK_OPTIONS} + -fsanitize=address,undefined + -pie + -Wl,-z,relro,-z,now + ) +endif() + +# test asm target +add_executable(blake3-asm-test + main.c +) 
+set_target_properties(blake3-asm-test PROPERTIES + OUTPUT_NAME blake3 + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}) +target_link_libraries(blake3-asm-test PRIVATE blake3-testing) +target_compile_definitions(blake3-asm-test PRIVATE BLAKE3_TESTING) +target_compile_options(blake3-asm-test PRIVATE + -O3 + -Wall + -Wextra + -pedantic + -fstack-protector-strong + -D_FORTIFY_SOURCE=2 + -fPIE + -fvisibility=hidden + -fsanitize=address,undefined +) +target_link_options(blake3-asm-test PRIVATE + -fsanitize=address,undefined + -pie + -Wl,-z,relro,-z,now +) + +add_test(NAME blake3-testing + COMMAND "${CMAKE_CTEST_COMMAND}" + --verbose + --extra-verbose + --build-and-test "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" + --build-generator "${CMAKE_GENERATOR}" + --build-makeprogram "${CMAKE_MAKE_PROGRAM}" + --build-project libblake3 + --build-target blake3-asm-test + --build-options + --fresh + "-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}" + "-DBLAKE3_TESTING=${BLAKE3_TESTING}" + "-DBLAKE3_TESTING_CI=${BLAKE3_TESTING_CI}" + "-DBLAKE3_USE_TBB=${BLAKE3_USE_TBB}" + "-DBLAKE3_SIMD_TYPE=${BLAKE3_SIMD_TYPE}" + "-DBLAKE3_NO_SSE2=${BLAKE3_NO_SSE2}" + "-DBLAKE3_NO_SSE41=${BLAKE3_NO_SSE41}" + "-DBLAKE3_NO_AVX2=${BLAKE3_NO_AVX2}" + "-DBLAKE3_NO_AVX512=${BLAKE3_NO_AVX512}" + --test-command + "${CMAKE_SOURCE_DIR}/test.py" + ) diff --git a/c/cmake/BLAKE3/Examples.cmake b/c/cmake/BLAKE3/Examples.cmake new file mode 100644 index 0000000..8911820 --- /dev/null +++ b/c/cmake/BLAKE3/Examples.cmake @@ -0,0 +1,6 @@ +if(NOT WIN32) + add_executable(blake3-example + example.c) + target_link_libraries(blake3-example PRIVATE blake3) + install(TARGETS blake3-example) +endif() diff --git a/c/cmake/BLAKE3/Testing.cmake b/c/cmake/BLAKE3/Testing.cmake new file mode 100644 index 0000000..1c22baa --- /dev/null +++ b/c/cmake/BLAKE3/Testing.cmake @@ -0,0 +1,3 @@ +if(BLAKE3_TESTING_CI) + include(BLAKE3/ContinuousIntegration) +endif()
\ No newline at end of file
diff --git a/c/dependencies/CMakeLists.txt b/c/dependencies/CMakeLists.txt
new file mode 100644
index 0000000..4382353
--- /dev/null
+++ b/c/dependencies/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(BLAKE3_USE_TBB)
+  add_subdirectory(tbb)
+endif()
diff --git a/c/dependencies/tbb/CMakeLists.txt b/c/dependencies/tbb/CMakeLists.txt
new file mode 100644
index 0000000..0f51395
--- /dev/null
+++ b/c/dependencies/tbb/CMakeLists.txt
@@ -0,0 +1,33 @@
+if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  # FATAL_ERROR stops processing immediately, so no statement may follow it in this branch.
+  message(FATAL_ERROR "BLAKE3_USE_TBB requires building with Clang or MSVC on Windows")
+endif()
+
+find_package(TBB 2021.11.0 QUIET) # QUIET: a missing package is handled by the optional fetch below
+
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14) # FetchContent_MakeAvailable() requires CMake 3.14+
+  include(FetchContent)
+
+  if(NOT TBB_FOUND AND BLAKE3_FETCH_TBB)
+    set(CMAKE_C_STANDARD 99)
+    set(CMAKE_C_EXTENSIONS OFF)
+
+    set(CMAKE_CXX_STANDARD 20)
+    set(CMAKE_CXX_EXTENSIONS ON)
+
+    option(TBB_TEST "Enable testing of the fetched oneTBB" OFF) # signature: option(<var> "<help>" [value])
+    option(TBBMALLOC_BUILD "Enable the tbbmalloc build of the fetched oneTBB" OFF)
+
+    mark_as_advanced(TBB_TEST)
+    mark_as_advanced(TBBMALLOC_BUILD)
+
+    FetchContent_Declare(
+      TBB
+      GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB
+      GIT_TAG 0c0ff192a2304e114bc9e6557582dfba101360ff # v2022.0.0
+      GIT_SHALLOW TRUE
+    )
+
+    FetchContent_MakeAvailable(TBB)
+  endif()
+endif()
|
