diff options
| author | Matthew Krupcale <[email protected]> | 2020-08-31 11:36:01 -0400 |
|---|---|---|
| committer | Matthew Krupcale <[email protected]> | 2020-08-31 12:12:42 -0400 |
| commit | be2da69b6b293764867c42fcbc278627271d9710 (patch) | |
| tree | 939daaec31af7466103b34e920a822dca6487b1a | |
| parent | 47e415c7f19d97b3a39720f9c892288e82d4bd99 (diff) | |
C: asm: simplify pblendw emulation
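Use statically calculated ~mask constants (PBLENDW_0x33_MASK and PBLENDW_0x3F_MASK, the complements of PBLENDW_0xCC_MASK and PBLENDW_0xC0_MASK) instead of computing the inverted mask at run time with pandn. This reduces the number of moves and registers necessary at the expense of an extra memory load per blend. This is probably a good trade-off since we are not bound by memory uops in this loop.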
| -rw-r--r-- | c/blake3_sse2_x86-64_unix.S | 97 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_gnu.S | 97 | ||||
| -rw-r--r-- | c/blake3_sse2_x86-64_windows_msvc.asm | 97 |
3 files changed, 108 insertions, 183 deletions
diff --git a/c/blake3_sse2_x86-64_unix.S b/c/blake3_sse2_x86-64_unix.S index 8b26125..245c519 100644 --- a/c/blake3_sse2_x86-64_unix.S +++ b/c/blake3_sse2_x86-64_unix.S @@ -1833,24 +1833,17 @@ blake3_hash_many_sse2: pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - movdqa xmmword ptr [rsp+0x20], xmm2 - movdqa xmmword ptr [rsp+0x40], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm13 - pand xmm3, xmm12 - movdqa xmm13, xmm3 - por xmm13, xmm2 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm12 - pand xmm3, xmm6 - movdqa xmm12, xmm3 + movdqa xmmword ptr [rsp+0x20], xmm2 + movdqa xmm2, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm2 movdqa xmm2, xmmword ptr [rsp+0x20] - movdqa xmm3, xmmword ptr [rsp+0x40] pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1864,24 +1857,17 @@ blake3_hash_many_sse2: pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - movdqa xmmword ptr [rsp+0x30], xmm2 - movdqa xmmword ptr [rsp+0x50], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm6 - pand xmm3, xmm5 - movdqa xmm6, xmm3 - por xmm6, xmm2 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm5 - pand xmm3, xmm14 - movdqa xmm5, xmm3 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] - movdqa xmm3, xmmword ptr [rsp+0x50] pshufd xmm5, 
xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -2013,19 +1999,14 @@ blake3_hash_many_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2133,19 +2114,14 @@ _blake3_compress_in_place_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2252,19 +2228,14 @@ _blake3_compress_xof_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr 
[PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2312,7 +2283,11 @@ BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/c/blake3_sse2_x86-64_windows_gnu.S b/c/blake3_sse2_x86-64_windows_gnu.S index b2ee40d..0800f4a 100644 --- a/c/blake3_sse2_x86-64_windows_gnu.S +++ b/c/blake3_sse2_x86-64_windows_gnu.S @@ -1844,24 +1844,17 @@ blake3_hash_many_sse2: pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - movdqa xmmword ptr [rsp+0x20], xmm2 - movdqa xmmword ptr [rsp+0x40], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm13 - pand xmm3, xmm12 - movdqa xmm13, xmm3 - por xmm13, xmm2 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm12 - pand xmm3, xmm6 - movdqa xmm12, xmm3 + movdqa xmmword ptr [rsp+0x20], xmm2 + movdqa xmm2, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm2 movdqa xmm2, xmmword ptr [rsp+0x20] - movdqa xmm3, xmmword ptr [rsp+0x40] pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1875,24 +1868,17 @@ blake3_hash_many_sse2: pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - movdqa xmmword ptr 
[rsp+0x30], xmm2 - movdqa xmmword ptr [rsp+0x50], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm6 - pand xmm3, xmm5 - movdqa xmm6, xmm3 - por xmm6, xmm2 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm3, xmm2 - pandn xmm2, xmm5 - pand xmm3, xmm14 - movdqa xmm5, xmm3 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] - movdqa xmm3, xmmword ptr [rsp+0x50] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -2024,19 +2010,14 @@ blake3_hash_many_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2153,19 +2134,14 @@ _blake3_compress_in_place_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr 
[PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2289,19 +2265,14 @@ blake3_compress_xof_sse2: pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 @@ -2353,7 +2324,11 @@ BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/c/blake3_sse2_x86-64_windows_msvc.asm b/c/blake3_sse2_x86-64_windows_msvc.asm index 70a3044..0a2d9cb 100644 --- a/c/blake3_sse2_x86-64_windows_msvc.asm +++ b/c/blake3_sse2_x86-64_windows_msvc.asm @@ -1845,24 +1845,17 @@ roundloop2: pshufd xmm4, xmm12, 39H movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 - movdqa xmmword ptr [rsp+20H], xmm2 - movdqa xmmword ptr [rsp+40H], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK] - movdqa xmm3, xmm2 - pandn xmm2, xmm13 - pand xmm3, xmm12 - movdqa xmm13, xmm3 - por xmm13, xmm2 + pand xmm13, xmmword 
ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK] - movdqa xmm3, xmm2 - pandn xmm2, xmm12 - pand xmm3, xmm6 - movdqa xmm12, xmm3 + movdqa xmmword ptr [rsp+20H], xmm2 + movdqa xmm2, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] por xmm12, xmm2 movdqa xmm2, xmmword ptr [rsp+20H] - movdqa xmm3, xmmword ptr [rsp+40H] pshufd xmm12, xmm12, 78H punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 @@ -1876,24 +1869,17 @@ roundloop2: pshufd xmm12, xmm5, 39H movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 - movdqa xmmword ptr [rsp+30H], xmm2 - movdqa xmmword ptr [rsp+50H], xmm3 - movdqa xmm2, xmmword ptr [PBLENDW_0xCC_MASK] - movdqa xmm3, xmm2 - pandn xmm2, xmm6 - pand xmm3, xmm5 - movdqa xmm6, xmm3 - por xmm6, xmm2 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 - movdqa xmm2, xmmword ptr [PBLENDW_0xC0_MASK] - movdqa xmm3, xmm2 - pandn xmm2, xmm5 - pand xmm3, xmm14 - movdqa xmm5, xmm3 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+30H] - movdqa xmm3, xmmword ptr [rsp+50H] pshufd xmm5, xmm5, 78H punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 @@ -2025,19 +2011,14 @@ roundloop1: pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, 
xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 @@ -2155,19 +2136,14 @@ _blake3_compress_in_place_sse2 PROC pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 @@ -2292,19 +2268,14 @@ _blake3_compress_xof_sse2 PROC pshufd xmm4, xmm8, 39H movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 - movdqa xmm10, xmmword ptr [PBLENDW_0xCC_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm9 - pand xmm11, xmm8 - movdqa xmm9, xmm11 - por xmm9, xmm10 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 - movdqa xmm10, xmmword ptr [PBLENDW_0xC0_MASK] - movdqa xmm11, xmm10 - pandn xmm10, xmm8 - pand xmm11, xmm6 - movdqa xmm8, xmm11 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] por xmm8, xmm10 pshufd xmm8, xmm8, 78H punpckhdq xmm5, xmm7 @@ -2368,8 +2339,12 @@ BLAKE3_BLOCK_LEN: CMP_MSB_MASK: dd 8 dup(80000000H) +PBLENDW_0x33_MASK: + dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H PBLENDW_0xCC_MASK: dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH +PBLENDW_0x3F_MASK: + dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H PBLENDW_0xC0_MASK: dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH |
