/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>

.section	.rodata.cst32.ROT8, "aM", @progbits, 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302
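# ROT8 and ROT16 are vpshufb masks: within every 32-bit lane they select the
# source bytes in rotated order, giving a rotate-left by 8 and by 16 bits
# respectively. Each .octa is repeated so the same mask applies to both
# 128-bit lanes of a ymm register.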
.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003
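# CTRINC holds the dwords 0..7 and gives each of the eight blocks in the
# 8-block path its own counter. CTR2BL adds 0 and 1 to the block counters of
# the two blocks interleaved in one ymm register; CTR4BL adds 2 and 3 for the
# third and fourth blocks in the 4-block path.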

ENTRY(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix
	# operations on four words in each matrix in parallel, but requires
	# shuffling to rearrange the words after each round.
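	# For reference, one ChaCha quarter round on words (a, b, c, d) is:
	#   a += b;  d ^= a;  d = rotl32(d, 16)
	#   c += d;  b ^= c;  b = rotl32(b, 12)
	#   a += b;  d ^= a;  d = rotl32(d, 8)
	#   c += d;  b ^= c;  b = rotl32(b, 7)
	# Below, a/b/c/d are whole rows x0..x3, so the four quarter rounds of
	# a round run in parallel within each 128-bit lane.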

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa	ROT8(%rip),%ymm4
	vmovdqa	ROT16(%rip),%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$12,%ymm6,%ymm6
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm6,%ymm1,%ymm1
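	# Rotations by 12 and 7 are not whole bytes, so they cannot use the
	# vpshufb masks; rotl32(x, 12) is built as (x << 12) | (x >> 20), and
	# rotl32(x, 7) as (x << 7) | (x >> 25).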

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
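	# These lane rotations move each matrix from column order into
	# diagonal order, so the second half of the double round operates on
	# the diagonals.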

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$12,%ymm6,%ymm6
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$7,%ymm7,%ymm7
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
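	# The inverse shuffles restore column order, completing one double
	# round.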

	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	vpxor	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	vpxor	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	vpxor	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	vpxor	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3
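	# The low 128-bit lane of each row register is the first block and
	# the high lane is the second, so the high halves extracted above
	# provide the keystream for bytes 0x40-0x7f.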

	# xor and write second block
	vpxor	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vpxor	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vpxor	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vpxor	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

	# xor remaining bytes from partial register into output
	vpxor	0x00(%rsp),%xmm7,%xmm7
	vmovdqa	%xmm7,0x00(%rsp)

ENDPROC(chacha_2block_xor_avx2)

ENTRY(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, interleaved
	# with the operations on the four words of the other two matrices.
	# Since the required word shuffling has a rather high latency, the
	# arithmetic on the second matrix-pair fits in without much slowdown.
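	# Blocks 0/1 are kept in %ymm0-%ymm3 (one block per 128-bit lane) and
	# blocks 2/3 in %ymm4-%ymm7; interleaving the two pairs' quarter
	# rounds hides the shuffle latency of one pair behind the arithmetic
	# of the other.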

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	ROT8(%rip),%ymm8
	vmovdqa	ROT16(%rip),%ymm9

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm9,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm9,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vpslld	$12,%ymm10,%ymm10
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxor	%ymm0,%ymm3,%ymm3
	vpshufb	%ymm8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxor	%ymm4,%ymm7,%ymm7
	vpshufb	%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxor	%ymm2,%ymm1,%ymm1
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm10,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxor	%ymm6,%ymm5,%ymm5
	vpslld	$7,%ymm10,%ymm10
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	vpxor	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	vpxor	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	vpxor	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	vpxor	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3
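	# As in the two-block path, the low lanes written above are the first
	# block; the extracted high lanes provide the keystream for the second
	# block at 0x40-0x7f.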

	# xor and write second block
	vpxor	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vpxor	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vpxor	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vpxor	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	vpxor	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	vpxor	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	vpxor	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	vpxor	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vpxor	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vpxor	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vpxor	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vpxor	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

	# xor remaining bytes from partial register into output
	vpxor	0x00(%rsp),%xmm10,%xmm10
	vmovdqa	%xmm10,0x00(%rsp)

ENDPROC(chacha_4block_xor_avx2)

ENTRY(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32-, 64- and
	# then 128-bit words, which allows us to do the XOR in AVX registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# byte shuffling, while 7/12-bit word rotation uses the traditional
	# shift+OR.
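	# Layout: each of %ymm0-%ymm15 holds one of the 16 state words,
	# replicated across the eight blocks (one block per 32-bit element),
	# so a single AVX2 instruction advances all eight blocks at once.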

	# 4 * 32 byte stack, 32-byte aligned

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	vmovdqa	%ymm0,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm3,0x60(%rsp)

	vmovdqa	CTRINC(%rip),%ymm1
	vmovdqa	ROT8(%rip),%ymm2
	vmovdqa	ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
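	# Second half of the double round: the diagonals are addressed simply
	# by picking different register combinations, so no word shuffling is
	# needed.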

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd	0x20(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd	0x40(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd	0x60(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd	%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd	%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd	%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd	%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd	%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd	%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd	%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd	%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd	%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12
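	# Transpose: after the three interleave passes below (32-, 64- and
	# then 128-bit) each ymm register holds 32 contiguous bytes of a
	# single block, ready to be XORed against the input.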

	# interleave 32-bit words in state n, n+1
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm1,0x60(%rsp)
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x60(%rsp)
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
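	# vperm2i128 $0x20 gathers the low 128-bit lanes of its two sources
	# (state n with state n+4) and $0x31 the high lanes, yielding the
	# first four and the last four blocks respectively.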
	vmovdqa	0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	vpxor	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vpxor	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa	0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	vpxor	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vpxor	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa	0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	vpxor	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vpxor	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa	0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	vpxor	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vpxor	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vpxor	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0100(%rsi)

	vpxor	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0120(%rsi)

	vpxor	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0140(%rsi)

	vpxor	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0160(%rsi)

	vpxor	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0180(%rsi)

	vpxor	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01a0(%rsi)

	vpxor	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01c0(%rsi)

	vpxor	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x01e0(%rsi)

	# xor remaining bytes from partial register into output
	lea	(%rdx,%rax),%rsi
	vpxor	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	lea	(%r11,%rax),%rdi

ENDPROC(chacha_8block_xor_avx2)