/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section        .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003

.section        .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302

.section        .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
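
# ROT8 and ROT16 are pshufb control masks: permuting the bytes of each 32-bit
# lane with them rotates every lane left by 8 or 16 bits, which is cheaper on
# SSSE3 than the shift+OR sequence used for the 7- and 12-bit rotates below.
# CTRINC holds the per-lane counter increments 0..3 used by the four-block
# function. As a rough C sketch of the rotate that such a byte shuffle
# replaces (illustration only, not part of the kernel sources):
#
#       static inline uint32_t rotl32(uint32_t v, int n)
#       {
#               return (v << n) | (v >> (32 - n));
#       }
#
# pshufb with ROT16 computes rotl32(lane, 16), and pshufb with ROT8 computes
# rotl32(lane, 8), for each of the four 32-bit lanes of an xmm register.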

ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 1 data block output, o
        # %rdx: up to 1 data block input, i
        # %rcx: input/output length in bytes

        # This function encrypts one ChaCha20 block by loading the state
        # matrix in four SSE registers. It performs matrix operations on
        # four words in parallel, but requires shuffling to rearrange the
        # words after each round. 8/16-bit word rotation is done with the
        # slightly better performing SSSE3 byte shuffling, 7/12-bit word
        # rotation uses traditional shift+OR.
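
        # As a rough C sketch of the round structure that the instruction
        # groups below implement (illustration only, with x0..x3 standing
        # for the four SSE registers, each holding one row of the 4x4 state,
        # and rotl32() an assumed helper):
        #
        #       x0 += x1;  x3 = rotl32(x3 ^ x0, 16);
        #       x2 += x3;  x1 = rotl32(x1 ^ x2, 12);
        #       x0 += x1;  x3 = rotl32(x3 ^ x0,  8);
        #       x2 += x3;  x1 = rotl32(x1 ^ x2,  7);
        #
        # Because each register holds a whole row, one such pass performs
        # the four column quarter-rounds in parallel; after the pshufd row
        # rotations further down, the very same pass performs the four
        # diagonal quarter-rounds.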

        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3
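
        # The three shuffles above rotate rows 1-3 so that the diagonals of
        # the original matrix now sit in the register columns; the identical
        # add/xor/rotate pattern below therefore implements the diagonal
        # round, and the inverse shuffles after it restore the row order for
        # the next double round.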

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3

        # o0 = i0 ^ (x0 + s0)
        movdqu          0x00(%rdx),%xmm4
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        movdqu          0x10(%rdx),%xmm0
        movdqu          %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        movdqu          0x20(%rdx),%xmm0
        movdqu          %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        movdqu          0x30(%rdx),%xmm0
        movdqu          %xmm0,0x30(%rsi)

        # xor remaining bytes from partial register into output
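        # (The sub-16-byte tail is handled by staging it in a stack buffer:
        # the remaining input bytes are copied there, xored with the
        # keystream register as below, and the valid bytes copied back out.)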
        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 4 data blocks output, o
        # %rdx: 4 data blocks input, i

        # This function encrypts four consecutive ChaCha20 blocks by loading
        # the state matrix in SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XORing step we transpose the matrix by interleaving 32- and then
        # 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
        # word rotation is done with the slightly better performing SSSE3
        # byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
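
        # As a rough mental model (illustration only; the array and helper
        # names are not from the kernel sources): registers x0..x15 each
        # hold word n of four independent blocks, one block per 32-bit lane,
        # so every quarter-round instruction advances all four blocks at
        # once:
        #
        #       uint32_t x[16][4];              /* x[word][block] */
        #       for (i = 0; i < 4; i++) {       /* four blocks in parallel */
        #               x[0][i] += x[4][i];
        #               x[12][i] = rotl32(x[12][i] ^ x[0][i], 16);
        #               /* ... remaining quarter-round steps ... */
        #       }
        #
        # The four blocks differ only in word 12 (the block counter), so the
        # state can simply be broadcast into all four lanes below, with
        # CTRINC added to x12 to give each lane its own counter.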

        # x0..15[0-3] = s0..3[0..3]
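        # (Each movq below loads two consecutive 32-bit state words into the
        # low half of a register; pshufd $0x00 then broadcasts the lower of
        # the two words and pshufd $0x55 the upper one across all four
        # lanes.)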
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15

        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3
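
        # CTRINC, ROT8 and ROT16 stay resident in %xmm1-%xmm3 for the
        # rounds; the displaced state rows x0..x3 live in the stack slots
        # written above and are worked on through the %xmm0 scratch register.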

        # x12 += counter values 0-3

        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        movdqa          %xmm0,0x10(%rsp)
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        movdqa          %xmm0,0x20(%rsp)
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        movdqa          %xmm0,0x30(%rsp)

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        movdqa          %xmm0,0x10(%rsp)
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        movdqa          %xmm0,0x20(%rsp)
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        movdqa          %xmm0,0x30(%rsp)

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        movdqa          %xmm0,0x10(%rsp)
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        movdqa          %xmm0,0x20(%rsp)
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        movdqa          %xmm0,0x30(%rsp)

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        movdqa          %xmm0,0x10(%rsp)
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        movdqa          %xmm0,0x20(%rsp)
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        movdqa          %xmm0,0x30(%rsp)

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
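
        # A column round (first half above) followed by a diagonal round
        # (second half) makes up one ChaCha20 double round; ten such double
        # rounds are performed before this point. Below, the original state
        # words are broadcast across the lanes once more and added back into
        # the computed state (the ChaCha20 feed-forward), and the CTRINC
        # counter values are added to x12 again to match.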

        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)

        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3

        # x12 += counter values 0-3

        # interleave 32-bit words in state n, n+1
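        # (Each lane of the x registers belongs to a different block, so the
        # two interleave passes that follow are a 4x4 32-bit transpose: after
        # the punpck{l,h}dq and punpck{l,h}qdq pairs, every register or stack
        # slot holds 16 contiguous bytes of a single block.)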
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)

        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0

        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0

        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0

        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0

        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0

        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)

        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0

        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0

        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0

        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0

        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0

        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0

        # xor with corresponding input, write to output
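        # (After the transpose the per-group block order is 0, 2, 1, 3, and
        # block n of the input/output starts at byte offset n * 0x40, which
        # is why the destination offsets below are interleaved as
        # 0x00/0x80/0x40/0xc0, 0x10/0x90/0x50/0xd0, and so on.)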
        movdqa          0x00(%rsp),%xmm0
        movdqu          0x00(%rdx),%xmm1
        movdqu          %xmm0,0x00(%rsi)
        movdqa          0x10(%rsp),%xmm0
        movdqu          0x80(%rdx),%xmm1
        movdqu          %xmm0,0x80(%rsi)
        movdqa          0x20(%rsp),%xmm0
        movdqu          0x40(%rdx),%xmm1
        movdqu          %xmm0,0x40(%rsi)
        movdqa          0x30(%rsp),%xmm0
        movdqu          0xc0(%rdx),%xmm1
        movdqu          %xmm0,0xc0(%rsi)
        movdqu          0x10(%rdx),%xmm1
        movdqu          %xmm4,0x10(%rsi)
        movdqu          0x90(%rdx),%xmm1
        movdqu          %xmm5,0x90(%rsi)
        movdqu          0x50(%rdx),%xmm1
        movdqu          %xmm6,0x50(%rsi)
        movdqu          0xd0(%rdx),%xmm1
        movdqu          %xmm7,0xd0(%rsi)
        movdqu          0x20(%rdx),%xmm1
        movdqu          %xmm8,0x20(%rsi)
        movdqu          0xa0(%rdx),%xmm1
        movdqu          %xmm9,0xa0(%rsi)
        movdqu          0x60(%rdx),%xmm1
        movdqu          %xmm10,0x60(%rsi)
        movdqu          0xe0(%rdx),%xmm1
        movdqu          %xmm11,0xe0(%rsi)
        movdqu          0x30(%rdx),%xmm1
        movdqu          %xmm12,0x30(%rsi)
        movdqu          0xb0(%rdx),%xmm1
        movdqu          %xmm13,0xb0(%rsi)
        movdqu          0x70(%rdx),%xmm1
        movdqu          %xmm14,0x70(%rsi)
        movdqu          0xf0(%rdx),%xmm1
        movdqu          %xmm15,0xf0(%rsi)

ENDPROC(chacha20_4block_xor_ssse3)