diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-16 15:36:30 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-24 00:57:39 -0400 |
| commit | 1ef915dbea5038811ce0a8e2effd138b13afbc2a (patch) | |
| tree | 870cf7c850858130c52fabc2d85c84a19ec35c61 | |
| parent | e632967a8da6205b118bb5921eb89f617ae8a12c (diff) | |
C: asm: emulate pinsrd using SSE2 instructions
Use two pinsrw instructions and a 16-bit shift to insert each 32-bit integer at the desired location.
* c/blake3_sse2_x86-64_unix.S: emulate pinsrd using SSE2 instructions for x86_64 Unix.
* c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU.
* c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 43 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 43 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 43 |
3 files changed, 102 insertions, 27 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 9382b8f..e033793 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1460,12 +1460,24 @@ blake3_hash_many_sse2:
         movaps  xmm8, xmm0
         movaps  xmm9, xmm1
         movd    xmm13, dword ptr [rsp+0x110]
-        pinsrd  xmm13, dword ptr [rsp+0x120], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x120]
+        sar     eax, 16                         /* ax = upper 16 bits of the dword */
+        pinsrw  xmm13, word ptr [rsp+0x120], 2  /* dword lane 1, low half */
+        pinsrw  xmm13, eax, 3                   /* dword lane 1, high half */
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4  /* dword lane 2, low half */
+        pinsrw  xmm13, eax, 5                   /* dword lane 2, high half */
         movaps  xmmword ptr [rsp], xmm13
         movd    xmm14, dword ptr [rsp+0x114]
-        pinsrd  xmm14, dword ptr [rsp+0x124], 1
-        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x124]      /* same dword whose low half goes into word 2 */
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [rsp+0x124], 2
+        pinsrw  xmm14, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
+        pinsrw  xmm14, eax, 5
         movaps  xmmword ptr [rsp+0x10], xmm14
         mov     r8, qword ptr [rdi]
         mov     r9, qword ptr [rdi+0x8]
@@ -1508,8 +1520,12 @@ blake3_hash_many_sse2:
         pshufd  xmm15, xmm11, 0x93
         movaps  xmm3, xmmword ptr [rsp]
         movaps  xmm11, xmmword ptr [rsp+0x10]
-        pinsrd  xmm3, eax, 3
-        pinsrd  xmm11, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16                        /* r14w = upper 16 bits of eax */
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
+        pinsrw  xmm11, eax, 6
+        pinsrw  xmm11, r14d, 7
         mov     al, 7
 9:
         paddd   xmm0, xmm4
@@ -1715,8 +1731,14 @@ blake3_hash_many_sse2:
         movups  xmm0, xmmword ptr [rcx]
         movups  xmm1, xmmword ptr [rcx+0x10]
         movd    xmm13, dword ptr [rsp+0x110]
-        pinsrd  xmm13, dword ptr [rsp+0x120], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x120]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [rsp+0x120], 2
+        pinsrw  xmm13, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
+        pinsrw  xmm13, eax, 5
         movaps  xmm14, xmmword ptr [ROT8+rip]
         movaps  xmm15, xmmword ptr [ROT16+rip]
         mov     r8, qword ptr [rdi]
@@ -1731,7 +1753,10 @@ blake3_hash_many_sse2:
         cmovne  eax, r14d
         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         movaps  xmm3, xmm13
-        pinsrd  xmm3, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
         movups  xmm4, xmmword ptr [r8+rdx-0x40]
         movups  xmm5, xmmword ptr [r8+rdx-0x30]
         movaps  xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 638e683..82e27ad 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1471,12 +1471,24 @@ blake3_hash_many_sse2:
         movaps  xmm8, xmm0
         movaps  xmm9, xmm1
         movd    xmm13, dword ptr [rsp+0x110]
-        pinsrd  xmm13, dword ptr [rsp+0x120], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x120]
+        sar     eax, 16                         /* ax = upper 16 bits of the dword */
+        pinsrw  xmm13, word ptr [rsp+0x120], 2  /* dword lane 1, low half */
+        pinsrw  xmm13, eax, 3                   /* dword lane 1, high half */
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4  /* dword lane 2, low half */
+        pinsrw  xmm13, eax, 5                   /* dword lane 2, high half */
         movaps  xmmword ptr [rsp], xmm13
         movd    xmm14, dword ptr [rsp+0x114]
-        pinsrd  xmm14, dword ptr [rsp+0x124], 1
-        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x124]      /* same dword whose low half goes into word 2 */
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [rsp+0x124], 2
+        pinsrw  xmm14, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4
+        pinsrw  xmm14, eax, 5
         movaps  xmmword ptr [rsp+0x10], xmm14
         mov     r8, qword ptr [rdi]
         mov     r9, qword ptr [rdi+0x8]
@@ -1519,8 +1531,12 @@ blake3_hash_many_sse2:
         pshufd  xmm15, xmm11, 0x93
         movaps  xmm3, xmmword ptr [rsp]
         movaps  xmm11, xmmword ptr [rsp+0x10]
-        pinsrd  xmm3, eax, 3
-        pinsrd  xmm11, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16                        /* r14w = upper 16 bits of eax */
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
+        pinsrw  xmm11, eax, 6
+        pinsrw  xmm11, r14d, 7
         mov     al, 7
 9:
         paddd   xmm0, xmm4
@@ -1726,8 +1742,14 @@ blake3_hash_many_sse2:
         movups  xmm0, xmmword ptr [rcx]
         movups  xmm1, xmmword ptr [rcx+0x10]
         movd    xmm13, dword ptr [rsp+0x110]
-        pinsrd  xmm13, dword ptr [rsp+0x120], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+        mov     eax, dword ptr [rsp+0x120]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [rsp+0x120], 2
+        pinsrw  xmm13, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN+rip]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4
+        pinsrw  xmm13, eax, 5
         movaps  xmm14, xmmword ptr [ROT8+rip]
         movaps  xmm15, xmmword ptr [ROT16+rip]
         mov     r8, qword ptr [rdi]
@@ -1742,7 +1764,10 @@ blake3_hash_many_sse2:
         cmovne  eax, r14d
         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         movaps  xmm3, xmm13
-        pinsrd  xmm3, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
         movups  xmm4, xmmword ptr [r8+rdx-0x40]
         movups  xmm5, xmmword ptr [r8+rdx-0x30]
         movaps  xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 41507a0..2d3900f 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1472,12 +1472,24 @@ final3blocks:
         movaps  xmm8, xmm0
         movaps  xmm9, xmm1
         movd    xmm13, dword ptr [rsp+110H]
-        pinsrd  xmm13, dword ptr [rsp+120H], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+        mov     eax, dword ptr [rsp+120H]
+        sar     eax, 16                         ; ax = upper 16 bits of the dword
+        pinsrw  xmm13, word ptr [rsp+120H], 2   ; dword lane 1, low half
+        pinsrw  xmm13, eax, 3                   ; dword lane 1, high half
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN], 4  ; dword lane 2, low half
+        pinsrw  xmm13, eax, 5                   ; dword lane 2, high half
         movaps  xmmword ptr [rsp], xmm13
         movd    xmm14, dword ptr [rsp+114H]
-        pinsrd  xmm14, dword ptr [rsp+124H], 1
-        pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+        mov     eax, dword ptr [rsp+124H]       ; same dword whose low half goes into word 2
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [rsp+124H], 2
+        pinsrw  xmm14, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN]
+        sar     eax, 16
+        pinsrw  xmm14, word ptr [BLAKE3_BLOCK_LEN], 4
+        pinsrw  xmm14, eax, 5
         movaps  xmmword ptr [rsp+10H], xmm14
         mov     r8, qword ptr [rdi]
         mov     r9, qword ptr [rdi+8H]
@@ -1520,8 +1532,12 @@ innerloop2:
         pshufd  xmm15, xmm11, 93H
         movaps  xmm3, xmmword ptr [rsp]
         movaps  xmm11, xmmword ptr [rsp+10H]
-        pinsrd  xmm3, eax, 3
-        pinsrd  xmm11, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16                        ; r14w = upper 16 bits of eax
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
+        pinsrw  xmm11, eax, 6
+        pinsrw  xmm11, r14d, 7
         mov     al, 7
 roundloop2:
         paddd   xmm0, xmm4
@@ -1727,8 +1743,14 @@ final1block:
         movups  xmm0, xmmword ptr [rcx]
         movups  xmm1, xmmword ptr [rcx+10H]
         movd    xmm13, dword ptr [rsp+110H]
-        pinsrd  xmm13, dword ptr [rsp+120H], 1
-        pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+        mov     eax, dword ptr [rsp+120H]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [rsp+120H], 2
+        pinsrw  xmm13, eax, 3
+        mov     eax, dword ptr [BLAKE3_BLOCK_LEN]
+        sar     eax, 16
+        pinsrw  xmm13, word ptr [BLAKE3_BLOCK_LEN], 4
+        pinsrw  xmm13, eax, 5
         movaps  xmm14, xmmword ptr [ROT8]
         movaps  xmm15, xmmword ptr [ROT16]
         mov     r8, qword ptr [rdi]
@@ -1743,7 +1765,10 @@ innerloop1:
         cmovne  eax, r14d
         movaps  xmm2, xmmword ptr [BLAKE3_IV]
         movaps  xmm3, xmm13
-        pinsrd  xmm3, eax, 3
+        mov     r14d, eax
+        sar     r14d, 16
+        pinsrw  xmm3, eax, 6
+        pinsrw  xmm3, r14d, 7
         movups  xmm4, xmmword ptr [r8+rdx-40H]
         movups  xmm5, xmmword ptr [r8+rdx-30H]
         movaps  xmm8, xmm4
