diff options
| author | Jack O'Connor <[email protected]> | 2021-11-05 00:17:05 -0400 |
|---|---|---|
| committer | Jack O'Connor <[email protected]> | 2021-11-05 12:18:23 -0400 |
| commit | 682a2f3fe8e3f68f7dd29a6bc8454b768b8a87fd (patch) | |
| tree | 4c262c3a58ac42522002a2a5037cc29f61677c39 /c | |
| parent | 04571021fb5490d0f0008c5b5a968f221de159a0 (diff) | |
fix incorrect output / undefined behavior in Windows SSE2 assembly [fix_sse2]
The SSE2 patch introduced xmm10 as a temporary register for one of the
rotations, but xmm6-xmm15 are callee-save registers on Windows, and
SSE4.1 was only saving the registers it used. The minimal fix is to use
one of the saved registers instead of xmm10.
See https://github.com/BLAKE3-team/BLAKE3/issues/206.
Diffstat (limited to 'c')
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 12 |
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 12 |
2 files changed, 12 insertions, 12 deletions
```diff
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 424b4f8..8852ba5 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
         por xmm9, xmm8
         movdqa xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa xmm10, xmm6
+        movdqa xmm14, xmm6
         pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
-        pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
-        por xmm8, xmm10
+        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por xmm8, xmm14
         pshufd xmm8, xmm8, 0x78
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
         por xmm9, xmm8
         movdqa xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa xmm10, xmm6
+        movdqa xmm14, xmm6
         pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
-        pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
-        por xmm8, xmm10
+        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por xmm8, xmm14
         pshufd xmm8, xmm8, 0x78
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index ff9bb4d..507502f 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
         por xmm9, xmm8
         movdqa xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa xmm10, xmm6
+        movdqa xmm14, xmm6
         pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
-        pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
-        por xmm8, xmm10
+        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+        por xmm8, xmm14
         pshufd xmm8, xmm8, 78H
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
         por xmm9, xmm8
         movdqa xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa xmm10, xmm6
+        movdqa xmm14, xmm6
         pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
-        pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
-        por xmm8, xmm10
+        pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+        por xmm8, xmm14
         pshufd xmm8, xmm8, 78H
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
```
