crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant
author Martin Willi <martin@strongswan.org>
Sun, 11 Nov 2018 09:36:25 +0000 (10:36 +0100)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 16 Nov 2018 06:11:04 +0000 (14:11 +0800)
Add a length argument to the single block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.
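As a rough sketch (not code from this patch; the helper name chacha20_ssse3_blocks
is made up here, the other names match the chacha20_glue.c hunk below), the glue
loop can now pass the remaining byte count straight through, with the trailing
partial block handled by the same call:

    asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst,
                                             const u8 *src, unsigned int len);

    /* Hypothetical caller: full 64-byte blocks and the trailing partial
     * block both go through the same single-block function. */
    static void chacha20_ssse3_blocks(u32 *state, u8 *dst, const u8 *src,
                                      unsigned int bytes)
    {
            while (bytes >= CHACHA20_BLOCK_SIZE) {
                    chacha20_block_xor_ssse3(state, dst, src, bytes);
                    bytes -= CHACHA20_BLOCK_SIZE;
                    src += CHACHA20_BLOCK_SIZE;
                    dst += CHACHA20_BLOCK_SIZE;
                    state[12]++;            /* advance the block counter */
            }
            if (bytes)                      /* 1..63 trailing bytes */
                    chacha20_block_xor_ssse3(state, dst, src, bytes);
    }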

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses simple "rep movsb" to copy the
data before and after doing XOR in SSE. This is rather efficient on
modern processors; movsw can be slightly faster, but the additional
complexity is probably not worth it.
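In C terms, the new .Lxorpart path behaves roughly like the sketch below; the
function name, the keystream parameter and the use of memcpy in place of
rep movsb are assumptions of this illustration, not code from the patch:

    /* Bounce the 1..15 bytes past the last full 16-byte chunk through an
     * aligned stack buffer, XOR them against the corresponding keystream
     * lane, and copy the result back out. */
    static void chacha20_xor_tail(u8 *dst, const u8 *src,
                                  const u8 *keystream, unsigned int len)
    {
            u8 buf[16] __aligned(16) = { 0 };
            unsigned int i;

            memcpy(buf, src, len);          /* first "rep movsb": copy in   */
            for (i = 0; i < 16; i++)
                    buf[i] ^= keystream[i]; /* one 128-bit pxor in the asm  */
            memcpy(dst, buf, len);          /* second "rep movsb": copy out */
    }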

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/chacha20-ssse3-x86_64.S
arch/x86/crypto/chacha20_glue.c

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 512a2b500fd1813d1ffc4d74053ec137ecf865c5..98d130b5e4ab5d8e7fa2d007fc2ecfbbb01eb167 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -25,12 +25,13 @@ CTRINC:     .octa 0x00000003000000020000000100000000
 
 ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
-       # %rsi: 1 data block output, o
-       # %rdx: 1 data block input, i
+       # %rsi: up to 1 data block output, o
+       # %rdx: up to 1 data block input, i
+       # %rcx: input/output length in bytes
 
        # This function encrypts one ChaCha20 block by loading the state matrix
        # in four SSE registers. It performs matrix operation on four words in
-       # parallel, but requireds shuffling to rearrange the words after each
+       # parallel, but requires shuffling to rearrange the words after each
        # round. 8/16-bit word rotation is done with the slightly better
        # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
        # traditional shift+OR.
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3)
        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5
 
-       mov     $10,%ecx
+       mov             %rcx,%rax
+       mov             $10,%ecx
 
 .Ldoubleround:
 
@@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
        jnz             .Ldoubleround
 
        # o0 = i0 ^ (x0 + s0)
-       movdqu          0x00(%rdx),%xmm4
        paddd           %xmm8,%xmm0
+       cmp             $0x10,%rax
+       jl              .Lxorpart
+       movdqu          0x00(%rdx),%xmm4
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
-       movdqu          0x10(%rdx),%xmm5
        paddd           %xmm9,%xmm1
-       pxor            %xmm5,%xmm1
-       movdqu          %xmm1,0x10(%rsi)
+       movdqa          %xmm1,%xmm0
+       cmp             $0x20,%rax
+       jl              .Lxorpart
+       movdqu          0x10(%rdx),%xmm0
+       pxor            %xmm1,%xmm0
+       movdqu          %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
-       movdqu          0x20(%rdx),%xmm6
        paddd           %xmm10,%xmm2
-       pxor            %xmm6,%xmm2
-       movdqu          %xmm2,0x20(%rsi)
+       movdqa          %xmm2,%xmm0
+       cmp             $0x30,%rax
+       jl              .Lxorpart
+       movdqu          0x20(%rdx),%xmm0
+       pxor            %xmm2,%xmm0
+       movdqu          %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
-       movdqu          0x30(%rdx),%xmm7
        paddd           %xmm11,%xmm3
-       pxor            %xmm7,%xmm3
-       movdqu          %xmm3,0x30(%rsi)
-
+       movdqa          %xmm3,%xmm0
+       cmp             $0x40,%rax
+       jl              .Lxorpart
+       movdqu          0x30(%rdx),%xmm0
+       pxor            %xmm3,%xmm0
+       movdqu          %xmm0,0x30(%rsi)
+
+.Ldone:
        ret
+
+.Lxorpart:
+       # xor remaining bytes from partial register into output
+       mov             %rax,%r9
+       and             $0x0f,%r9
+       jz              .Ldone
+       and             $~0x0f,%rax
+
+       mov             %rsi,%r11
+
+       lea             8(%rsp),%r10
+       sub             $0x10,%rsp
+       and             $~31,%rsp
+
+       lea             (%rdx,%rax),%rsi
+       mov             %rsp,%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       pxor            0x00(%rsp),%xmm0
+       movdqa          %xmm0,0x00(%rsp)
+
+       mov             %rsp,%rsi
+       lea             (%r11,%rax),%rdi
+       mov             %r9,%rcx
+       rep movsb
+
+       lea             -8(%r10),%rsp
+       jmp             .Ldone
+
 ENDPROC(chacha20_block_xor_ssse3)
 
 ENTRY(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index dce7c5d39c2f26f8e1cf6d31c69e1ef627f6cb10..cc4571736ce8396f805ead132d3e8cd80798de14 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -19,7 +19,8 @@
 
 #define CHACHA20_STATE_ALIGN 16
 
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+                                        unsigned int len);
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
 #ifdef CONFIG_AS_AVX2
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
@@ -29,8 +30,6 @@ static bool chacha20_use_avx2;
 static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
                            unsigned int bytes)
 {
-       u8 buf[CHACHA20_BLOCK_SIZE];
-
 #ifdef CONFIG_AS_AVX2
        if (chacha20_use_avx2) {
                while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
@@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
                state[12] += 4;
        }
        while (bytes >= CHACHA20_BLOCK_SIZE) {
-               chacha20_block_xor_ssse3(state, dst, src);
+               chacha20_block_xor_ssse3(state, dst, src, bytes);
                bytes -= CHACHA20_BLOCK_SIZE;
                src += CHACHA20_BLOCK_SIZE;
                dst += CHACHA20_BLOCK_SIZE;
                state[12]++;
        }
        if (bytes) {
-               memcpy(buf, src, bytes);
-               chacha20_block_xor_ssse3(state, buf, buf);
-               memcpy(dst, buf, bytes);
+               chacha20_block_xor_ssse3(state, dst, src, bytes);
        }
 }