/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section        .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
.section        .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
.section        .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
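
# ROT8 and ROT16 are pshufb byte-shuffle masks: permuting the bytes of each
# 32-bit lane with these patterns is equivalent to rotating that lane left by
# 8 or 16 bits. CTRINC holds the per-lane increments 0..3 that give each of
# the four parallel blocks in the 4-block variant its own counter value.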

.text

ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 1 data block output, o
        # %rdx: up to 1 data block input, i
        # %rcx: input/output length in bytes

        # This function encrypts one ChaCha20 block by loading the state
        # matrix into four SSE registers. It performs matrix operations on
        # four words in parallel, but requires shuffling to rearrange the
        # words after each round. 8/16-bit word rotation is done with the
        # slightly better performing SSSE3 byte shuffling, 7/12-bit word
        # rotation uses traditional shift+OR.
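        #
        # State layout (RFC7539): x0 holds the four "expand 32-byte k"
        # constants, x1 and x2 hold the 256-bit key, and x3 holds the 32-bit
        # block counter followed by the 96-bit nonce. Each row lives in one
        # SSE register (%xmm0..%xmm3 below).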

        # x0..3 = s0..3
        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3
        movdqa          %xmm0,%xmm8
        movdqa          %xmm1,%xmm9
        movdqa          %xmm2,%xmm10
        movdqa          %xmm3,%xmm11

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

        mov             %rcx,%rax
        mov             $10,%ecx
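        # The byte count is kept in %rax for the partial-length handling
        # below; %ecx counts the ten double rounds (20 ChaCha20 rounds). Each
        # double round is a column round followed by a diagonal round built
        # from quarter rounds, roughly (illustrative pseudocode):
        #
        #       a += b; d ^= a; d = rotl32(d, 16)
        #       c += d; b ^= c; b = rotl32(b, 12)
        #       a += b; d ^= a; d = rotl32(d,  8)
        #       c += d; b ^= c; b = rotl32(b,  7)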

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3
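        # The shuffles above move the state into diagonal form, so the same
        # column-wise quarter-round code below now operates on the diagonals.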

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3
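        # The reverse shuffles restore column order, so the next iteration's
        # quarter rounds operate on columns again.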

        dec             %ecx
        jnz             .Ldoubleround

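        # Feed-forward and XOR with the input, 16 bytes at a time. Before
        # each 16-byte chunk the remaining length in %rax is checked; if
        # fewer bytes are left, the keystream for that chunk is kept in
        # %xmm0 and the tail is handled byte-wise at .Lxorpart.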
        # o0 = i0 ^ (x0 + s0)
        paddd           %xmm8,%xmm0
        cmp             $0x10,%rax
        jl              .Lxorpart
        movdqu          0x00(%rdx),%xmm4
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        paddd           %xmm9,%xmm1
        movdqa          %xmm1,%xmm0
        cmp             $0x20,%rax
        jl              .Lxorpart
        movdqu          0x10(%rdx),%xmm0
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        paddd           %xmm10,%xmm2
        movdqa          %xmm2,%xmm0
        cmp             $0x30,%rax
        jl              .Lxorpart
        movdqu          0x20(%rdx),%xmm0
        pxor            %xmm2,%xmm0
        movdqu          %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        paddd           %xmm11,%xmm3
        movdqa          %xmm3,%xmm0
        cmp             $0x40,%rax
        jl              .Lxorpart
        movdqu          0x30(%rdx),%xmm0
        pxor            %xmm3,%xmm0
        movdqu          %xmm0,0x30(%rsi)

.Ldone:
        ret

.Lxorpart:
        # xor remaining bytes from partial register into output
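        # The remaining 1..15 bytes are copied into an aligned scratch buffer
        # on the stack, XORed there with the keystream chunk left in %xmm0,
        # and the result is copied back to the output. %rax is rounded down
        # to the offset of the partial chunk, %r9 holds the tail length.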
        mov             %rax,%r9
        and             $0x0f,%r9
        jz              .Ldone
        and             $~0x0f,%rax

        mov             %rsi,%r11

        lea             8(%rsp),%r10
        sub             $0x10,%rsp
        and             $~31,%rsp

        lea             (%rdx,%rax),%rsi
        mov             %rsp,%rdi
        mov             %r9,%rcx
        rep movsb

        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

        mov             %rsp,%rsi
        lea             (%r11,%rax),%rdi
        mov             %r9,%rcx
        rep movsb

        lea             -8(%r10),%rsp
        jmp             .Ldone

ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 4 data blocks output, o
        # %rdx: 4 data blocks input, i

        # This function encrypts four consecutive ChaCha20 blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XORing step we transpose the matrix by interleaving 32- and then
        # 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
        # word rotation is done with the slightly better performing SSSE3
        # byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
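        #
        # Register allocation during the rounds: x0..x3 live in the stack
        # scratch area at 0x00..0x30(%rsp), x4..x15 in %xmm4..%xmm15, %xmm0
        # is a temporary, and %xmm1/%xmm2/%xmm3 hold CTRINC, ROT8 and ROT16.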

        lea             8(%rsp),%r10
        sub             $0x80,%rsp
        and             $~63,%rsp

        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12
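        # Each lane of x12 now carries the counter for its own block, so the
        # four blocks computed below are four consecutive keystream blocks.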

        mov             $10,%ecx

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4

        dec             %ecx
        jnz             .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm4
        paddd           %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm6
        paddd           %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm8
        paddd           %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm10
        paddd           %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm12
        paddd           %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm14
        paddd           %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

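        # (The per-lane counter increments are applied again above because
        # the feed-forward only added the base counter s3[0], while each
        # block started from s3[0] plus its lane index.)
        #
        # At this point each register or stack slot holds one state word
        # across all four blocks. Two interleave passes, first on 32-bit and
        # then on 64-bit words, transpose this layout so that each register
        # ends up holding 16 consecutive bytes of a single block, letting the
        # XOR with the input use whole-register loads and stores.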
        # interleave 32-bit words in state n, n+1
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0
        movdqa          %xmm0,%xmm5
        movdqa          %xmm6,%xmm0
        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0
        movdqa          %xmm0,%xmm9
        movdqa          %xmm10,%xmm0
        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0
        movdqa          %xmm0,%xmm13
        movdqa          %xmm14,%xmm0
        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0
        movdqa          %xmm0,%xmm6
        movdqa          %xmm5,%xmm0
        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0
        movdqa          %xmm0,%xmm10
        movdqa          %xmm9,%xmm0
        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0
        movdqa          %xmm0,%xmm14
        movdqa          %xmm13,%xmm0
        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

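        # The transpose leaves the four blocks in the order 0, 2, 1, 3, which
        # is why the load/store offsets below step through 0x00, 0x80, 0x40,
        # 0xc0 within each group of state words instead of sequentially.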
        # xor with corresponding input, write to output
        movdqa          0x00(%rsp),%xmm0
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        movdqa          0x10(%rsp),%xmm0
        movdqu          0x80(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x80(%rsi)
        movdqa          0x20(%rsp),%xmm0
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)
        movdqa          0x30(%rsp),%xmm0
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)
        movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm4
        movdqu          %xmm4,0x10(%rsi)
        movdqu          0x90(%rdx),%xmm1
        pxor            %xmm1,%xmm5
        movdqu          %xmm5,0x90(%rsi)
        movdqu          0x50(%rdx),%xmm1
        pxor            %xmm1,%xmm6
        movdqu          %xmm6,0x50(%rsi)
        movdqu          0xd0(%rdx),%xmm1
        pxor            %xmm1,%xmm7
        movdqu          %xmm7,0xd0(%rsi)
        movdqu          0x20(%rdx),%xmm1
        pxor            %xmm1,%xmm8
        movdqu          %xmm8,0x20(%rsi)
        movdqu          0xa0(%rdx),%xmm1
        pxor            %xmm1,%xmm9
        movdqu          %xmm9,0xa0(%rsi)
        movdqu          0x60(%rdx),%xmm1
        pxor            %xmm1,%xmm10
        movdqu          %xmm10,0x60(%rsi)
        movdqu          0xe0(%rdx),%xmm1
        pxor            %xmm1,%xmm11
        movdqu          %xmm11,0xe0(%rsi)
        movdqu          0x30(%rdx),%xmm1
        pxor            %xmm1,%xmm12
        movdqu          %xmm12,0x30(%rsi)
        movdqu          0xb0(%rdx),%xmm1
        pxor            %xmm1,%xmm13
        movdqu          %xmm13,0xb0(%rsi)
        movdqu          0x70(%rdx),%xmm1
        pxor            %xmm1,%xmm14
        movdqu          %xmm14,0x70(%rsi)
        movdqu          0xf0(%rdx),%xmm1
        pxor            %xmm1,%xmm15
        movdqu          %xmm15,0xf0(%rsi)

        lea             -8(%r10),%rsp
        ret
ENDPROC(chacha20_4block_xor_ssse3)