From 360523bf5230cc5b0bddc1af8201bd001c86bcd3 Mon Sep 17 00:00:00 2001
From: Jack O'Connor
Date: Sat, 12 Jul 2025 18:19:35 -0700
Subject: Add xof_many_avx512 for Windows

---
Review notes (this section is ignored by `git am`):

  * blake3_xof_many_avx512(cv, block, block_len, counter, flags, out,
    outblocks) is implemented for Windows as a loop: it calls
    blake3_compress_xof_avx512 once per output block, advancing `out` by 64
    bytes and `counter` by one each iteration, and returns immediately when
    outblocks is zero.

  * Stack alignment fix: the Microsoft x64 ABI requires rsp to be 16-byte
    aligned at every `call`. On entry rsp is 8 mod 16, and the eight pushes
    (64 bytes) leave it at 8 mod 16, so the per-call reservation must be
    8 mod 16 as well. It is therefore 56 bytes (32 shadow + 16 for the two
    stack arguments + 8 padding) rather than 48. The stack-argument offsets
    (+0x20, +0x28) are unchanged. blake3_compress_xof_avx512 releases a
    72-byte frame in its epilogue, which presumably holds aligned xmm saves
    and would fault when entered with a misaligned stack -- TODO confirm
    against its prologue.

  * Indentation and in-line spacing of context lines in this copy were
    re-created by hand, and the post-image blob ids on the `index` lines
    predate the 48 -> 56 change. Apply with `git apply --ignore-whitespace`
    or regenerate the patch from the tree.

 c/blake3_avx512_x86-64_windows_gnu.S    | 49 +++++++++++++++++++++++++++++++
 c/blake3_avx512_x86-64_windows_msvc.asm | 51 +++++++++++++++++++++++++++++++++
 c/blake3_c_rust_bindings/src/lib.rs     |  1 -
 c/blake3_dispatch.c                     |  2 +-
 c/blake3_impl.h                         |  2 --
 src/ffi_avx512.rs                       |  3 ---
 src/platform.rs                         |  1 -
 7 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/c/blake3_avx512_x86-64_windows_gnu.S b/c/blake3_avx512_x86-64_windows_gnu.S
index ba4fc5f..532652e 100644
--- a/c/blake3_avx512_x86-64_windows_gnu.S
+++ b/c/blake3_avx512_x86-64_windows_gnu.S
@@ -6,6 +6,8 @@
 .global _blake3_compress_in_place_avx512
 .global blake3_compress_xof_avx512
 .global _blake3_compress_xof_avx512
+.global blake3_xof_many_avx512
+.global _blake3_xof_many_avx512
 
 .section .text
 .p2align 6
@@ -2587,6 +2589,53 @@ blake3_compress_xof_avx512:
         add rsp, 72
         ret
 
+.p2align 6
+_blake3_xof_many_avx512:
+blake3_xof_many_avx512:
+        push r15
+        push r14
+        push r13
+        push r12
+        push rdi
+        push rsi
+        push rbx
+        push rbp                        # 8 pushes = 64 bytes below the entry rsp
+        mov rbp, rsp                    # rbp+0x40 is the entry rsp
+        mov rdi, rcx                    # arg 1: cv
+        mov rsi, rdx                    # arg 2: block
+        movzx r12d, r8b                 # arg 3: block_len
+        mov r14, r9                     # arg 4: counter
+        movzx r13d, byte ptr [rbp+0x68] # arg 5: flags     (entry rsp+0x28)
+        mov rbx, qword ptr [rbp+0x70]   # arg 6: out       (entry rsp+0x30)
+        mov r15, qword ptr [rbp+0x78]   # arg 7: outblocks (entry rsp+0x38)
+1:
+        test r15, r15                   # no blocks left: done
+        jz 2f
+        mov rcx, rdi
+        mov rdx, rsi
+        mov r8d, r12d
+        mov r9, r14
+        sub rsp, 56                     # shadow 32 + args 16 + pad 8: rsp is 16-aligned at the call
+        mov byte ptr [rsp+0x20], r13b   # stack arg 5: flags
+        mov qword ptr [rsp+0x28], rbx   # stack arg 6: out
+        call blake3_compress_xof_avx512
+        add rsp, 56
+        add rbx, 64                     # advance out by one 64-byte block
+        inc r14                         # next block uses counter + 1
+        dec r15
+        jmp 1b
+2:
+        mov rsp, rbp
+        pop rbp
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r12
+        pop r13
+        pop r14
+        pop r15
+        ret
+
 .section .rdata
 .p2align 6
 INDEX0:
diff --git a/c/blake3_avx512_x86-64_windows_msvc.asm b/c/blake3_avx512_x86-64_windows_msvc.asm
index b19efba..ea59fde 100644
--- a/c/blake3_avx512_x86-64_windows_msvc.asm
+++ b/c/blake3_avx512_x86-64_windows_msvc.asm
@@ -4,6 +4,8 @@ public blake3_compress_in_place_avx512
 public _blake3_compress_in_place_avx512
 public blake3_compress_xof_avx512
 public _blake3_compress_xof_avx512
+public blake3_xof_many_avx512
+public _blake3_xof_many_avx512
 
 _TEXT SEGMENT ALIGN(16) 'CODE'
 
@@ -2600,6 +2602,55 @@ _blake3_compress_xof_avx512 PROC
 _blake3_compress_xof_avx512 ENDP
 blake3_compress_xof_avx512 ENDP
 
+ALIGN 16
+blake3_xof_many_avx512 PROC
+_blake3_xof_many_avx512 PROC
+        push r15
+        push r14
+        push r13
+        push r12
+        push rdi
+        push rsi
+        push rbx
+        push rbp                        ; 8 pushes = 64 bytes below the entry rsp
+        mov rbp, rsp                    ; rbp+40H is the entry rsp
+        mov rdi, rcx                    ; arg 1: cv
+        mov rsi, rdx                    ; arg 2: block
+        movzx r12d, r8b                 ; arg 3: block_len
+        mov r14, r9                     ; arg 4: counter
+        movzx r13d, byte ptr [rbp+68H]  ; arg 5: flags     (entry rsp+28H)
+        mov rbx, qword ptr [rbp+70H]    ; arg 6: out       (entry rsp+30H)
+        mov r15, qword ptr [rbp+78H]    ; arg 7: outblocks (entry rsp+38H)
+@@loop:
+        test r15, r15                   ; no blocks left: done
+        jz @@end
+        mov rcx, rdi
+        mov rdx, rsi
+        mov r8d, r12d
+        mov r9, r14
+        sub rsp, 56                     ; shadow 32 + args 16 + pad 8: rsp is 16-aligned at the call
+        mov byte ptr [rsp+20H], r13b    ; stack arg 5: flags
+        mov qword ptr [rsp+28H], rbx    ; stack arg 6: out
+        call blake3_compress_xof_avx512
+        add rsp, 56
+        add rbx, 64                     ; advance out by one 64-byte block
+        inc r14                         ; next block uses counter + 1
+        dec r15
+        jmp @@loop
+@@end:
+        mov rsp, rbp
+        pop rbp
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r12
+        pop r13
+        pop r14
+        pop r15
+        ret
+_blake3_xof_many_avx512 ENDP
+blake3_xof_many_avx512 ENDP
+
 _TEXT ENDS
 
 _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
diff --git a/c/blake3_c_rust_bindings/src/lib.rs b/c/blake3_c_rust_bindings/src/lib.rs
index c2b3989..75a0be3 100644
--- a/c/blake3_c_rust_bindings/src/lib.rs
+++ b/c/blake3_c_rust_bindings/src/lib.rs
@@ -299,7 +299,6 @@ pub mod ffi {
             flags_end: u8,
             out: *mut u8,
         );
-        #[cfg(unix)]
         pub fn blake3_xof_many_avx512(
             cv: *const u32,
             block: *const u8,
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
index eae7a01..0862ec1 100644
--- a/c/blake3_dispatch.c
+++ b/c/blake3_dispatch.c
@@ -235,7 +235,7 @@ void blake3_xof_many(const uint32_t cv[8],
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
   MAYBE_UNUSED(features);
-#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+#if !defined(BLAKE3_NO_AVX512)
   if (features & AVX512VL) {
     blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
     return;
diff --git a/c/blake3_impl.h b/c/blake3_impl.h
index facd599..8e0d897 100644
--- a/c/blake3_impl.h
+++ b/c/blake3_impl.h
@@ -309,14 +309,12 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t *out);
 
-#if !defined(_WIN32)
 void blake3_xof_many_avx512(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
                             uint8_t block_len, uint64_t counter, uint8_t flags,
                             uint8_t* out, size_t outblocks);
 #endif
 #endif
-#endif
 
 #if BLAKE3_USE_NEON == 1
 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
diff --git a/src/ffi_avx512.rs b/src/ffi_avx512.rs
index e648eda..dfb779e 100644
--- a/src/ffi_avx512.rs
+++ b/src/ffi_avx512.rs
@@ -73,7 +73,6 @@ pub unsafe fn hash_many(
 }
 
 // Unsafe because this may only be called on platforms supporting AVX-512.
-#[cfg(unix)]
 pub unsafe fn xof_many(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
@@ -125,7 +124,6 @@ pub mod ffi {
             flags_end: u8,
             out: *mut u8,
         );
-        #[cfg(unix)]
         pub fn blake3_xof_many_avx512(
             cv: *const u32,
             block: *const u8,
@@ -158,7 +156,6 @@ mod test {
         crate::test::test_hash_many_fn(hash_many, hash_many);
     }
 
-    #[cfg(unix)]
     #[test]
     fn test_xof_many() {
         if !crate::platform::avx512_detected() {
diff --git a/src/platform.rs b/src/platform.rs
index 3a05420..4f59b37 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -329,7 +329,6 @@ impl Platform {
         match self {
             // Safe because detect() checked for platform support.
             #[cfg(blake3_avx512_ffi)]
-            #[cfg(unix)]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
                 crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
-- 
cgit v1.2.3