aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--c/blake3_avx512_x86-64_windows_gnu.S49
-rw-r--r--c/blake3_avx512_x86-64_windows_msvc.asm51
-rw-r--r--c/blake3_c_rust_bindings/src/lib.rs1
-rw-r--r--c/blake3_dispatch.c2
-rw-r--r--c/blake3_impl.h2
-rw-r--r--src/ffi_avx512.rs3
-rw-r--r--src/platform.rs1
7 files changed, 101 insertions, 8 deletions
diff --git a/c/blake3_avx512_x86-64_windows_gnu.S b/c/blake3_avx512_x86-64_windows_gnu.S
index ba4fc5f..532652e 100644
--- a/c/blake3_avx512_x86-64_windows_gnu.S
+++ b/c/blake3_avx512_x86-64_windows_gnu.S
@@ -6,6 +6,8 @@
.global _blake3_compress_in_place_avx512
.global blake3_compress_xof_avx512
.global _blake3_compress_xof_avx512
+.global blake3_xof_many_avx512
+.global _blake3_xof_many_avx512
.section .text
.p2align 6
@@ -2587,6 +2589,53 @@ blake3_compress_xof_avx512:
add rsp, 72
ret
+.p2align 6 /* void blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks) -- Win64 ABI */
+_blake3_xof_many_avx512:
+blake3_xof_many_avx512: /* loops over outblocks, calling blake3_compress_xof_avx512 once per 64 bytes of out */
+ push r15 /* save every callee-saved GPR this routine uses (r12-r15, rdi, rsi, rbx, rbp) */
+ push r14
+ push r13
+ push r12
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rbp, rsp /* frame pointer: stack args sit at rbp + 64 (pushes) + 8 (return addr) + 32 (shadow) = rbp+0x68 */
+ mov rdi, rcx /* rdi = cv (arg 1) */
+ mov rsi, rdx /* rsi = block (arg 2) */
+ movzx r12d, r8b /* r12 = block_len (arg 3), zero-extended from 8 bits */
+ mov r14, r9 /* r14 = counter (arg 4) */
+ movzx r13d, byte ptr [rbp+0x68] /* r13 = flags (arg 5) */
+ mov rbx, qword ptr [rbp+0x70] /* rbx = out (arg 6) */
+ mov r15, qword ptr [rbp+0x78] /* r15 = outblocks (arg 7), used as the remaining-iterations count */
+1:
+ test r15, r15
+ jz 2f /* nothing (left) to produce */
+ mov rcx, rdi /* reload the four register args; rcx/rdx/r8/r9 are volatile across the call */
+ mov rdx, rsi
+ mov r8d, r12d
+ mov r9, r14
+ sub rsp, 56 /* 32 shadow + 2 stack args + 8 pad: rsp is 8 mod 16 after the 8 pushes, so 56 (not 48) gives the 16-byte alignment Win64 requires at a call */
+ mov byte ptr [rsp+0x20], r13b /* stack arg 5: flags */
+ mov qword ptr [rsp+0x28], rbx /* stack arg 6: out pointer for this block */
+ call blake3_compress_xof_avx512
+ add rsp, 56
+ add rbx, 64 /* advance out by one 64-byte block */
+ inc r14 /* next block uses counter + 1 */
+ dec r15
+ jmp 1b
+2:
+ mov rsp, rbp /* drop the frame and restore saved registers in reverse push order */
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+
.section .rdata
.p2align 6
INDEX0:
diff --git a/c/blake3_avx512_x86-64_windows_msvc.asm b/c/blake3_avx512_x86-64_windows_msvc.asm
index b19efba..ea59fde 100644
--- a/c/blake3_avx512_x86-64_windows_msvc.asm
+++ b/c/blake3_avx512_x86-64_windows_msvc.asm
@@ -4,6 +4,8 @@ public blake3_compress_in_place_avx512
public _blake3_compress_in_place_avx512
public blake3_compress_xof_avx512
public _blake3_compress_xof_avx512
+public blake3_xof_many_avx512
+public _blake3_xof_many_avx512
_TEXT SEGMENT ALIGN(16) 'CODE'
@@ -2600,6 +2602,55 @@ _blake3_compress_xof_avx512 PROC
_blake3_compress_xof_avx512 ENDP
blake3_compress_xof_avx512 ENDP
+ALIGN 16 ; void blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks) -- Win64 ABI
+blake3_xof_many_avx512 PROC
+_blake3_xof_many_avx512 PROC ; loops over outblocks, calling blake3_compress_xof_avx512 once per 64 bytes of out
+ push r15 ; save every callee-saved GPR this routine uses (r12-r15, rdi, rsi, rbx, rbp)
+ push r14
+ push r13
+ push r12
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov rbp, rsp ; frame pointer: stack args sit at rbp + 64 (pushes) + 8 (return addr) + 32 (shadow) = rbp+68H
+ mov rdi, rcx ; rdi = cv (arg 1)
+ mov rsi, rdx ; rsi = block (arg 2)
+ movzx r12d, r8b ; r12 = block_len (arg 3), zero-extended from 8 bits
+ mov r14, r9 ; r14 = counter (arg 4)
+ movzx r13d, byte ptr [rbp+68H] ; r13 = flags (arg 5)
+ mov rbx, qword ptr [rbp+70H] ; rbx = out (arg 6)
+ mov r15, qword ptr [rbp+78H] ; r15 = outblocks (arg 7), used as the remaining-iterations count
+@@loop:
+ test r15, r15
+ jz @@end ; nothing (left) to produce
+ mov rcx, rdi ; reload the four register args; rcx/rdx/r8/r9 are volatile across the call
+ mov rdx, rsi
+ mov r8d, r12d
+ mov r9, r14
+ sub rsp, 56 ; 32 shadow + 2 stack args + 8 pad: rsp is 8 mod 16 after the 8 pushes, so 56 (not 48) gives the 16-byte alignment Win64 requires at a call
+ mov byte ptr [rsp+20H], r13b ; stack arg 5: flags
+ mov qword ptr [rsp+28H], rbx ; stack arg 6: out pointer for this block
+ call blake3_compress_xof_avx512
+ add rsp, 56
+ add rbx, 64 ; advance out by one 64-byte block
+ inc r14 ; next block uses counter + 1
+ dec r15
+ jmp @@loop
+@@end:
+ mov rsp, rbp ; drop the frame and restore saved registers in reverse push order
+ pop rbp
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+_blake3_xof_many_avx512 ENDP
+blake3_xof_many_avx512 ENDP
+
_TEXT ENDS
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
diff --git a/c/blake3_c_rust_bindings/src/lib.rs b/c/blake3_c_rust_bindings/src/lib.rs
index c2b3989..75a0be3 100644
--- a/c/blake3_c_rust_bindings/src/lib.rs
+++ b/c/blake3_c_rust_bindings/src/lib.rs
@@ -299,7 +299,6 @@ pub mod ffi {
flags_end: u8,
out: *mut u8,
);
- #[cfg(unix)]
pub fn blake3_xof_many_avx512(
cv: *const u32,
block: *const u8,
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
index eae7a01..0862ec1 100644
--- a/c/blake3_dispatch.c
+++ b/c/blake3_dispatch.c
@@ -235,7 +235,7 @@ void blake3_xof_many(const uint32_t cv[8],
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
-#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
return;
diff --git a/c/blake3_impl.h b/c/blake3_impl.h
index facd599..8e0d897 100644
--- a/c/blake3_impl.h
+++ b/c/blake3_impl.h
@@ -309,14 +309,12 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
-#if !defined(_WIN32)
void blake3_xof_many_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t* out, size_t outblocks);
#endif
#endif
-#endif
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
diff --git a/src/ffi_avx512.rs b/src/ffi_avx512.rs
index e648eda..dfb779e 100644
--- a/src/ffi_avx512.rs
+++ b/src/ffi_avx512.rs
@@ -73,7 +73,6 @@ pub unsafe fn hash_many<const N: usize>(
}
// Unsafe because this may only be called on platforms supporting AVX-512.
-#[cfg(unix)]
pub unsafe fn xof_many(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
@@ -125,7 +124,6 @@ pub mod ffi {
flags_end: u8,
out: *mut u8,
);
- #[cfg(unix)]
pub fn blake3_xof_many_avx512(
cv: *const u32,
block: *const u8,
@@ -158,7 +156,6 @@ mod test {
crate::test::test_hash_many_fn(hash_many, hash_many);
}
- #[cfg(unix)]
#[test]
fn test_xof_many() {
if !crate::platform::avx512_detected() {
diff --git a/src/platform.rs b/src/platform.rs
index 3a05420..4f59b37 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -329,7 +329,6 @@ impl Platform {
match self {
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
- #[cfg(unix)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::xof_many(cv, block, block_len, counter, flags, out)