aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Krupcale <[email protected]>2020-08-16 15:36:30 -0400
committerMatthew Krupcale <[email protected]>2020-08-24 00:57:39 -0400
commit1ef915dbea5038811ce0a8e2effd138b13afbc2a (patch)
tree870cf7c850858130c52fabc2d85c84a19ec35c61
parente632967a8da6205b118bb5921eb89f617ae8a12c (diff)
C: asm: emulate pinsrd using SSE2 instructions
Use two pinsrw and a 16-bit shift to insert the 32-bit integer at the desired location. * c/blake3_sse2_x86-64_unix.S: emulate pinsrd using SSE2 instructions for x86_64 unix * c/blake3_sse2_x86-64_windows_gnu.S: Likewise for x86_64 Windows GNU. * c/blake3_sse2_x86-64_windows_msvc.asm: Likewise for x86_64 Windows MSVC.
-rw-r--r--c/blake3_sse2_x86-64_unix.S43
-rw-r--r--c/blake3_sse2_x86-64_windows_gnu.S43
-rw-r--r--c/blake3_sse2_x86-64_windows_msvc.asm43
3 files changed, 102 insertions, 27 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S
index 9382b8f..e033793 100644
--- a/c/blake3_sse2_x86-64_unix.S
+++ b/c/blake3_sse2_x86-64_unix.S
@@ -1460,12 +1460,24 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x120] /* SSE2 stand-in for pinsrd lane 1: fetch the source dword */
+ sar eax, 16 /* ax = bits 31:16; pinsrw reads only the low word of a register source */
+ pinsrw xmm13, word ptr [rsp+0x120], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm13, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm13, eax, 5 /* upper half -> word 5 */
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
- pinsrd xmm14, dword ptr [rsp+0x124], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x124] /* lane 1 of xmm14 comes from [rsp+0x124]; both halves must use that dword */
+ sar eax, 16 /* ax = bits 31:16 of the dword at [rsp+0x124] */
+ pinsrw xmm14, word ptr [rsp+0x124], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm14, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm14, eax, 5 /* upper half -> word 5 */
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@@ -1508,8 +1520,12 @@ blake3_hash_many_sse2:
pshufd xmm15, xmm11, 0x93
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
+ mov r14d, eax /* SSE2 stand-in for pinsrd lane 3: copy eax so its upper half can be shifted down */
+ sar r14d, 16 /* r14w = bits 31:16 of eax; NOTE(review): overwrites r14d - confirm it is dead here */
+ pinsrw xmm3, eax, 6 /* low word of eax -> word 6 (lower half of dword lane 3) */
+ pinsrw xmm3, r14d, 7 /* high word of eax -> word 7 (upper half of dword lane 3) */
+ pinsrw xmm11, eax, 6 /* same 32-bit value into dword lane 3 of xmm11 */
+ pinsrw xmm11, r14d, 7
mov al, 7
9:
paddd xmm0, xmm4
@@ -1715,8 +1731,14 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x120] /* SSE2 stand-in for pinsrd lane 1: fetch the source dword */
+ sar eax, 16 /* ax = bits 31:16; pinsrw reads only the low word of a register source */
+ pinsrw xmm13, word ptr [rsp+0x120], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm13, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm13, eax, 5 /* upper half -> word 5 */
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
@@ -1731,7 +1753,10 @@ blake3_hash_many_sse2:
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
+ mov r14d, eax /* SSE2 stand-in for pinsrd lane 3; r14d was last read by the cmovne above */
+ sar r14d, 16 /* r14w = bits 31:16 of eax */
+ pinsrw xmm3, eax, 6 /* low word of eax -> word 6 (lower half of dword lane 3) */
+ pinsrw xmm3, r14d, 7 /* high word of eax -> word 7 (upper half of dword lane 3) */
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S
index 638e683..82e27ad 100644
--- a/c/blake3_sse2_x86-64_windows_gnu.S
+++ b/c/blake3_sse2_x86-64_windows_gnu.S
@@ -1471,12 +1471,24 @@ blake3_hash_many_sse2:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x120] /* SSE2 stand-in for pinsrd lane 1: fetch the source dword */
+ sar eax, 16 /* ax = bits 31:16; pinsrw reads only the low word of a register source */
+ pinsrw xmm13, word ptr [rsp+0x120], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm13, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm13, eax, 5 /* upper half -> word 5 */
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+0x114]
- pinsrd xmm14, dword ptr [rsp+0x124], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x124] /* lane 1 of xmm14 comes from [rsp+0x124]; both halves must use that dword */
+ sar eax, 16 /* ax = bits 31:16 of the dword at [rsp+0x124] */
+ pinsrw xmm14, word ptr [rsp+0x124], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm14, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm14, eax, 5 /* upper half -> word 5 */
movaps xmmword ptr [rsp+0x10], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
@@ -1519,8 +1531,12 @@ blake3_hash_many_sse2:
pshufd xmm15, xmm11, 0x93
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+0x10]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
+ mov r14d, eax /* SSE2 stand-in for pinsrd lane 3: copy eax so its upper half can be shifted down */
+ sar r14d, 16 /* r14w = bits 31:16 of eax; NOTE(review): overwrites r14d - confirm it is dead here */
+ pinsrw xmm3, eax, 6 /* low word of eax -> word 6 (lower half of dword lane 3) */
+ pinsrw xmm3, r14d, 7 /* high word of eax -> word 7 (upper half of dword lane 3) */
+ pinsrw xmm11, eax, 6 /* same 32-bit value into dword lane 3 of xmm11 */
+ pinsrw xmm11, r14d, 7
mov al, 7
9:
paddd xmm0, xmm4
@@ -1726,8 +1742,14 @@ blake3_hash_many_sse2:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+0x10]
movd xmm13, dword ptr [rsp+0x110]
- pinsrd xmm13, dword ptr [rsp+0x120], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ mov eax, dword ptr [rsp+0x120] /* SSE2 stand-in for pinsrd lane 1: fetch the source dword */
+ sar eax, 16 /* ax = bits 31:16; pinsrw reads only the low word of a register source */
+ pinsrw xmm13, word ptr [rsp+0x120], 2 /* lower half of the dword -> word 2 */
+ pinsrw xmm13, eax, 3 /* upper half of the dword -> word 3 */
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN+rip] /* SSE2 stand-in for pinsrd lane 2 */
+ sar eax, 16 /* ax = bits 31:16 of BLAKE3_BLOCK_LEN */
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN+rip], 4 /* lower half -> word 4 */
+ pinsrw xmm13, eax, 5 /* upper half -> word 5 */
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
mov r8, qword ptr [rdi]
@@ -1742,7 +1764,10 @@ blake3_hash_many_sse2:
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
+ mov r14d, eax /* SSE2 stand-in for pinsrd lane 3; r14d was last read by the cmovne above */
+ sar r14d, 16 /* r14w = bits 31:16 of eax */
+ pinsrw xmm3, eax, 6 /* low word of eax -> word 6 (lower half of dword lane 3) */
+ pinsrw xmm3, r14d, 7 /* high word of eax -> word 7 (upper half of dword lane 3) */
movups xmm4, xmmword ptr [r8+rdx-0x40]
movups xmm5, xmmword ptr [r8+rdx-0x30]
movaps xmm8, xmm4
diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm
index 41507a0..2d3900f 100644
--- a/c/blake3_sse2_x86-64_windows_msvc.asm
+++ b/c/blake3_sse2_x86-64_windows_msvc.asm
@@ -1472,12 +1472,24 @@ final3blocks:
movaps xmm8, xmm0
movaps xmm9, xmm1
movd xmm13, dword ptr [rsp+110H]
- pinsrd xmm13, dword ptr [rsp+120H], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ mov eax, dword ptr [rsp+120H] ; SSE2 stand-in for pinsrd lane 1: fetch the source dword
+ sar eax, 16 ; ax = bits 31:16; pinsrw reads only the low word of a register source
+ pinsrw xmm13, word ptr [rsp+120H], 2 ; lower half of the dword -> word 2
+ pinsrw xmm13, eax, 3 ; upper half of the dword -> word 3
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN] ; SSE2 stand-in for pinsrd lane 2
+ sar eax, 16 ; ax = bits 31:16 of BLAKE3_BLOCK_LEN
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 ; lower half -> word 4
+ pinsrw xmm13, eax, 5 ; upper half -> word 5
movaps xmmword ptr [rsp], xmm13
movd xmm14, dword ptr [rsp+114H]
- pinsrd xmm14, dword ptr [rsp+124H], 1
- pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
+ mov eax, dword ptr [rsp+124H] ; lane 1 of xmm14 comes from [rsp+124H]; both halves must use that dword
+ sar eax, 16 ; ax = bits 31:16 of the dword at [rsp+124H]
+ pinsrw xmm14, word ptr [rsp+124H], 2 ; lower half of the dword -> word 2
+ pinsrw xmm14, eax, 3 ; upper half of the dword -> word 3
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN] ; SSE2 stand-in for pinsrd lane 2
+ sar eax, 16 ; ax = bits 31:16 of BLAKE3_BLOCK_LEN
+ pinsrw xmm14, word ptr [BLAKE3_BLOCK_LEN], 4 ; lower half -> word 4
+ pinsrw xmm14, eax, 5 ; upper half -> word 5
movaps xmmword ptr [rsp+10H], xmm14
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+8H]
@@ -1520,8 +1532,12 @@ innerloop2:
pshufd xmm15, xmm11, 93H
movaps xmm3, xmmword ptr [rsp]
movaps xmm11, xmmword ptr [rsp+10H]
- pinsrd xmm3, eax, 3
- pinsrd xmm11, eax, 3
+ mov r14d, eax ; SSE2 stand-in for pinsrd lane 3: copy eax so its upper half can be shifted down
+ sar r14d, 16 ; r14w = bits 31:16 of eax; NOTE(review): overwrites r14d - confirm it is dead here
+ pinsrw xmm3, eax, 6 ; low word of eax -> word 6 (lower half of dword lane 3)
+ pinsrw xmm3, r14d, 7 ; high word of eax -> word 7 (upper half of dword lane 3)
+ pinsrw xmm11, eax, 6 ; same 32-bit value into dword lane 3 of xmm11
+ pinsrw xmm11, r14d, 7
mov al, 7
roundloop2:
paddd xmm0, xmm4
@@ -1727,8 +1743,14 @@ final1block:
movups xmm0, xmmword ptr [rcx]
movups xmm1, xmmword ptr [rcx+10H]
movd xmm13, dword ptr [rsp+110H]
- pinsrd xmm13, dword ptr [rsp+120H], 1
- pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
+ mov eax, dword ptr [rsp+120H] ; SSE2 stand-in for pinsrd lane 1: fetch the source dword
+ sar eax, 16 ; ax = bits 31:16; pinsrw reads only the low word of a register source
+ pinsrw xmm13, word ptr [rsp+120H], 2 ; lower half of the dword -> word 2
+ pinsrw xmm13, eax, 3 ; upper half of the dword -> word 3
+ mov eax, dword ptr [BLAKE3_BLOCK_LEN] ; SSE2 stand-in for pinsrd lane 2
+ sar eax, 16 ; ax = bits 31:16 of BLAKE3_BLOCK_LEN
+ pinsrw xmm13, word ptr [BLAKE3_BLOCK_LEN], 4 ; lower half -> word 4
+ pinsrw xmm13, eax, 5 ; upper half -> word 5
movaps xmm14, xmmword ptr [ROT8]
movaps xmm15, xmmword ptr [ROT16]
mov r8, qword ptr [rdi]
@@ -1743,7 +1765,10 @@ innerloop1:
cmovne eax, r14d
movaps xmm2, xmmword ptr [BLAKE3_IV]
movaps xmm3, xmm13
- pinsrd xmm3, eax, 3
+ mov r14d, eax ; SSE2 stand-in for pinsrd lane 3; r14d was last read by the cmovne above
+ sar r14d, 16 ; r14w = bits 31:16 of eax
+ pinsrw xmm3, eax, 6 ; low word of eax -> word 6 (lower half of dword lane 3)
+ pinsrw xmm3, r14d, 7 ; high word of eax -> word 7 (upper half of dword lane 3)
movups xmm4, xmmword ptr [r8+rdx-40H]
movups xmm5, xmmword ptr [r8+rdx-30H]
movaps xmm8, xmm4