aboutsummaryrefslogtreecommitdiff
path: root/c
diff options
context:
space:
mode:
author: Jack O'Connor <[email protected]> 2021-11-05 00:17:05 -0400
committer: Jack O'Connor <[email protected]> 2021-11-05 12:18:23 -0400
commit: 682a2f3fe8e3f68f7dd29a6bc8454b768b8a87fd (patch)
tree: 4c262c3a58ac42522002a2a5037cc29f61677c39 /c
parent: 04571021fb5490d0f0008c5b5a968f221de159a0 (diff)
fix incorrect output / undefined behavior in Windows SSE2 assembly [fix_sse2]
The SSE2 patch introduced xmm10 as a temporary register for one of the rotations, but xmm6-xmm15 are callee-saved registers on Windows, and the SSE4.1 code was only saving the registers it used. The minimal fix is to use one of the saved registers instead of xmm10. See https://github.com/BLAKE3-team/BLAKE3/issues/206.
Diffstat (limited to 'c')
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 12
-rw-r--r-- c/blake3_sse2_x86-64_windows_msvc.asm 12
2 files changed, 12 insertions, 12 deletions
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 424b4f8..8852ba5 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index ff9bb4d..507502f 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 78H
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5