aboutsummaryrefslogtreecommitdiff
path: root/c/blake3_sse2_x86-64_windows_gnu.S
diff options
context:
space:
mode:
author Jack O'Connor <[email protected]> 2021-11-05 00:17:05 -0400
committer Jack O'Connor <[email protected]> 2021-11-05 12:25:44 -0400
commit 371b5483c95be1e0250c5209d68a8536406152de (patch)
tree 00bc4905f45079f9ba02cedb84e1d230eccf8ca3 /c/blake3_sse2_x86-64_windows_gnu.S
parent 04571021fb5490d0f0008c5b5a968f221de159a0 (diff)
fix incorrect output / undefined behavior in Windows SSE2 assembly
The SSE2 patch introduced xmm10 as a temporary register for one of the rotations, but xmm6-xmm15 are callee-save registers on Windows, and SSE4.1 was only saving the registers it used. The minimal fix is to use one of the saved registers instead of xmm10. See https://github.com/BLAKE3-team/BLAKE3/issues/206.
Diffstat (limited to 'c/blake3_sse2_x86-64_windows_gnu.S')
-rw-r--r-- c/blake3_sse2_x86-64_windows_gnu.S 12
1 files changed, 6 insertions, 6 deletions
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 424b4f8..8852ba5 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
por xmm9, xmm8
movdqa xmm8, xmm7
punpcklqdq xmm8, xmm5
- movdqa xmm10, xmm6
+ movdqa xmm14, xmm6
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
- por xmm8, xmm10
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm14
pshufd xmm8, xmm8, 0x78
punpckhdq xmm5, xmm7
punpckldq xmm6, xmm5