1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived from, and highly optimized relative to, the code
## described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2}
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
## From the definition of the spec, aadLen can only be 8 or 12 bytes.
## The code additionally supports an aadLen of 16 bytes.
## From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation is used: one tab is
## for the GHASH part, two tabs are for the AES part.
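##
## Illustrative note: in the bit-reflected representation used throughout this
## file, the polynomial above is encoded by the POLY constant below
## (0xC2000000000000000000000000000001): bits 127, 126, 121 and 0 are set, with
## the x^128 term implicit. It is the standard GHASH polynomial
## x^128 + x^7 + x^2 + x + 1 seen through GCM's reversed bit ordering.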
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
# The order of these constants must not change: ALL_F must follow SHIFT_MASK,
# and the all-zero block must follow ALL_F.
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
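
# Worked example (an informal sketch of how GCM_ENC_DEC below consumes these
# tables; it assumes the pointer adjustment that makes the final load end on
# the last valid byte): for a trailing block of r13 = 4 bytes, %r12 is set to
# SHIFT_MASK+16-4, so the 16 bytes loaded there are
#   {0x0c, 0x0d, 0x0e, 0x0f, 0xff x 12}
# vpshufb with that mask moves the 4 valid (top) bytes of the loaded block down
# to positions 0..3 and zeroes the rest, while the companion load from
# ALL_F-SHIFT_MASK(%r12) yields {0xff x 4, 0x00 x 12}, the vpand mask used to
# clear the top 16-r13 bytes.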
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
190 #define CurCount 16*4
191 #define PBlockLen 16*5
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
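
# Note on the *_k values: GHASH_MUL_AVX below uses one level of Karatsuba,
# splitting each 128-bit operand into 64-bit halves (a1:a0) and (b1:b0).
# Over GF(2):
#     a*b = a1*b1*x^128 + [(a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0]*x^64 + a0*b0
# Precomputing (b1 ^ b0) for every HashKey power means the main loop only has
# to form (a1 ^ a0) for the data blocks.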
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
220 #define keysize 2*15*16(arg1)
230 .macro define_reg r n
# need to push 4 callee-saved registers onto the stack so they can be restored on exit
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
253 VARIABLE_OFFSET = 16*8
255 ################################
257 ################################
# the number of pushes (8 bytes each) must match STACK_OFFSET
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
283 # Encryption of a single block
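# (REP is the number of vaesenc rounds between the initial key-whitening xor
# and the final vaesenclast: 9 for AES-128, 11 for AES-192, 13 for AES-256,
# as selected by the key-size dispatch at the ENTRY points below.)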
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
289 vaesenc 16*i(arg1), \XMM0, \XMM0
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
296 # combined for GCM encrypt and decrypt functions
297 # clobbering all xmm registers
298 # clobbering r10, r11, r12, r13, r14, r15
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 add arg5, InLen(arg2)
304 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
305 and $-16, %r13 # r13 = r13 - (r13 mod 16)
310 jz _initial_num_blocks_is_0\@
313 je _initial_num_blocks_is_7\@
315 je _initial_num_blocks_is_6\@
317 je _initial_num_blocks_is_5\@
319 je _initial_num_blocks_is_4\@
321 je _initial_num_blocks_is_3\@
323 je _initial_num_blocks_is_2\@
325 jmp _initial_num_blocks_is_1\@
327 _initial_num_blocks_is_7\@:
328 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
330 jmp _initial_blocks_encrypted\@
332 _initial_num_blocks_is_6\@:
333 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 jmp _initial_blocks_encrypted\@
337 _initial_num_blocks_is_5\@:
338 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340 jmp _initial_blocks_encrypted\@
342 _initial_num_blocks_is_4\@:
343 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345 jmp _initial_blocks_encrypted\@
347 _initial_num_blocks_is_3\@:
348 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350 jmp _initial_blocks_encrypted\@
352 _initial_num_blocks_is_2\@:
353 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 jmp _initial_blocks_encrypted\@
357 _initial_num_blocks_is_1\@:
358 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360 jmp _initial_blocks_encrypted\@
362 _initial_num_blocks_is_0\@:
363 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
366 _initial_blocks_encrypted\@:
368 je _zero_cipher_left\@
371 je _eight_cipher_left\@
378 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
388 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
391 jne _encrypt_by_8_new\@
393 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
394 jmp _eight_cipher_left\@
397 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
403 jne _encrypt_by_8_new\@
405 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
410 _eight_cipher_left\@:
411 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
415 vmovdqu %xmm14, AadHash(arg2)
416 vmovdqu %xmm9, CurCount(arg2)
419 jl _only_less_than_16\@
422 and $15, %r13 # r13 = (arg5 mod 16)
424 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
428 mov %r13, PBlockLen(arg2)
430 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
431 vmovdqu %xmm9, CurCount(arg2)
432 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
434 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
435 vmovdqu %xmm9, PBlockEncKey(arg2)
439 vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block
441 lea SHIFT_MASK+16(%rip), %r12
442 sub %r13, %r12 # adjust the shuffle mask pointer to be
443 # able to shift 16-r13 bytes (r13 is the
444 # number of bytes in plaintext mod 16)
445 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
446 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
447 jmp _final_ghash_mul\@
449 _only_less_than_16\@:
452 and $15, %r13 # r13 = (arg5 mod 16)
454 je _multiple_of_16_bytes\@
456 # handle the last <16 Byte block separately
459 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
460 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
461 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
463 vmovdqu %xmm9, PBlockEncKey(arg2)
465 lea SHIFT_MASK+16(%rip), %r12
466 sub %r13, %r12 # adjust the shuffle mask pointer to be
467 # able to shift 16-r13 bytes (r13 is the
468 # number of bytes in plaintext mod 16)
470 _get_last_16_byte_loop\@:
471 movb (arg4, %r11), %al
472 movb %al, TMP1 (%rsp , %r11)
475 jne _get_last_16_byte_loop\@
477 vmovdqu TMP1(%rsp), %xmm1
484 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
485 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
486 # mask out top 16-r13 bytes of xmm9
487 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
488 vpand %xmm1, %xmm2, %xmm2
489 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
490 vpxor %xmm2, %xmm14, %xmm14
492 vmovdqu %xmm14, AadHash(arg2)
496 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
497 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
498 # mask out top 16-r13 bytes of xmm9
499 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
500 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
501 vpxor %xmm9, %xmm14, %xmm14
503 vmovdqu %xmm14, AadHash(arg2)
506 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
510 #############################
514 jle _less_than_8_bytes_left\@
516 mov %rax, (arg3 , %r11)
518 vpsrldq $8, %xmm9, %xmm9
522 _less_than_8_bytes_left\@:
523 movb %al, (arg3 , %r11)
527 jne _less_than_8_bytes_left\@
528 #############################
530 _multiple_of_16_bytes\@:
531 GCM_COMPLETE \GHASH_MUL \REP
535 # GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
537 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
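# (Sketch of the steps below: any buffered partial block is first folded into
# the hash, then len(A)||len(C) in bits is GHASHed in, the result is
# byte-swapped and XORed with E(K, Y0) recomputed from OrigIV, and the first
# auth_tag_len bytes of that value are written to auth_tag.)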
538 .macro GCM_COMPLETE GHASH_MUL REP
539 vmovdqu AadHash(arg2), %xmm14
540 vmovdqu HashKey(arg2), %xmm13
542 mov PBlockLen(arg2), %r12
546 #GHASH computation for the last <16 Byte block
547 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
550 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
551 shl $3, %r12 # convert into number of bits
552 vmovd %r12d, %xmm15 # len(A) in xmm15
554 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*8)
557 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
558 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
560 vpxor %xmm15, %xmm14, %xmm14
561 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
562 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
564 vmovdqu OrigIV(arg2), %xmm9
566 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
568 vpxor %xmm14, %xmm9, %xmm9
573 mov arg9, %r10 # r10 = authTag
574 mov arg10, %r11 # r11 = auth_tag_len
587 vpsrldq $8, %xmm9, %xmm9
595 vpsrldq $4, %xmm9, %xmm9
612 vmovdqu %xmm9, (%r10)
617 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
619 mov \AAD, %r10 # r10 = AAD
620 mov \AADLEN, %r12 # r12 = aadLen
631 vpshufb SHUF_MASK(%rip), \T7, \T7
633 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
638 jge _get_AAD_blocks\@
645 /* read the last <16B of AAD. since we have at least 4B of
646 data right after the AAD (the ICV, and maybe some CT), we can
647 read 4B/8B blocks safely, and then get rid of the extra stuff */
665 vpslldq $12, \T1, \T1
669 /* finalize: shift out the extra bytes we read, and align
670 left. since pslldq can only shift by an immediate, we use
671 vpshufb and an array of shuffle masks */
674 vmovdqu aad_shift_arr(%r11), \T1
675 vpshufb \T1, \T7, \T7
676 _get_AAD_rest_final\@:
677 vpshufb SHUF_MASK(%rip), \T7, \T7
679 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
682 vmovdqu \T7, AadHash(arg2)
685 .macro INIT GHASH_MUL PRECOMPUTE
687 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
689 mov %r11, InLen(arg2) # ctx_data.in_length = 0
691 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
692 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
695 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
697 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
698 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
700 vmovdqu (arg3), %xmm6 # xmm6 = HashKey
702 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
703 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
705 vpsllq $1, %xmm6, %xmm6
706 vpsrlq $63, %xmm2, %xmm2
708 vpslldq $8, %xmm2, %xmm2
709 vpsrldq $8, %xmm1, %xmm1
710 vpor %xmm2, %xmm6, %xmm6
712 vpshufd $0b00100100, %xmm1, %xmm2
713 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
714 vpand POLY(%rip), %xmm2, %xmm2
715 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
716 #######################################################################
717 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
719 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
721 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
725 ###############################################################################
726 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
727 # Input: A and B (128-bits each, bit-reflected)
728 # Output: C = A*B*x mod poly, (i.e. >>1 )
729 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
730 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
731 ###############################################################################
732 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
734 vpshufd $0b01001110, \GH, \T2
735 vpshufd $0b01001110, \HK, \T3
736 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
737 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
739 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
740 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
741 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
743 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
745 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
746 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
748 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
750 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shifting << 31
vpslld $30, \GH, \T3 # packed left shifting << 30
vpslld $25, \GH, \T4 # packed left shifting << 25
755 vpxor \T3, \T2, \T2 # xor the shifted versions
758 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
760 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
761 vpxor \T2, \GH, \GH # first phase of the reduction complete
763 #second phase of the reduction
vpsrld $1,\GH, \T2 # packed right shifting >> 1
vpsrld $2,\GH, \T3 # packed right shifting >> 2
vpsrld $7,\GH, \T4 # packed right shifting >> 7
768 vpxor \T3, \T2, \T2 # xor the shifted versions
773 vpxor \T1, \GH, \GH # the result is in GH
778 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
783 vpshufd $0b01001110, \T5, \T1
785 vmovdqu \T1, HashKey_k(arg2)
787 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
788 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
789 vpshufd $0b01001110, \T5, \T1
791 vmovdqu \T1, HashKey_2_k(arg2)
793 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
794 vmovdqu \T5, HashKey_3(arg2)
795 vpshufd $0b01001110, \T5, \T1
797 vmovdqu \T1, HashKey_3_k(arg2)
799 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
800 vmovdqu \T5, HashKey_4(arg2)
801 vpshufd $0b01001110, \T5, \T1
803 vmovdqu \T1, HashKey_4_k(arg2)
805 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
806 vmovdqu \T5, HashKey_5(arg2)
807 vpshufd $0b01001110, \T5, \T1
809 vmovdqu \T1, HashKey_5_k(arg2)
811 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
812 vmovdqu \T5, HashKey_6(arg2)
813 vpshufd $0b01001110, \T5, \T1
815 vmovdqu \T1, HashKey_6_k(arg2)
817 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
818 vmovdqu \T5, HashKey_7(arg2)
819 vpshufd $0b01001110, \T5, \T1
821 vmovdqu \T1, HashKey_7_k(arg2)
823 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
824 vmovdqu \T5, HashKey_8(arg2)
825 vpshufd $0b01001110, \T5, \T1
827 vmovdqu \T1, HashKey_8_k(arg2)
## if a = number of total plaintext bytes
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
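##
## (The leading num_initial_blocks blocks are handled here so that the main
## loop in GCM_ENC_DEC always works on full groups of 8 blocks; their GHASH is
## computed serially, one GHASH_MUL_AVX per block, near the end of this macro.)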
838 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
839 i = (8-\num_initial_blocks)
841 vmovdqu AadHash(arg2), reg_i
843 # initialize the data pointer offset as zero
846 # start AES for num_initial_blocks blocks
847 vmovdqu CurCount(arg2), \CTR
849 i = (9-\num_initial_blocks)
851 .rep \num_initial_blocks
852 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
854 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
859 vmovdqa (arg1), \T_key
860 i = (9-\num_initial_blocks)
862 .rep \num_initial_blocks
863 vpxor \T_key, reg_i, reg_i
871 vmovdqa 16*j(arg1), \T_key
872 i = (9-\num_initial_blocks)
874 .rep \num_initial_blocks
875 vaesenc \T_key, reg_i, reg_i
884 vmovdqa 16*j(arg1), \T_key
885 i = (9-\num_initial_blocks)
887 .rep \num_initial_blocks
888 vaesenclast \T_key, reg_i, reg_i
893 i = (9-\num_initial_blocks)
895 .rep \num_initial_blocks
896 vmovdqu (arg4, %r11), \T1
897 vpxor \T1, reg_i, reg_i
898 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
903 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
909 i = (8-\num_initial_blocks)
910 j = (9-\num_initial_blocks)
913 .rep \num_initial_blocks
914 vpxor reg_i, reg_j, reg_j
915 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
920 # XMM8 has the combined result here
922 vmovdqa \XMM8, TMP1(%rsp)
926 jl _initial_blocks_done\@ # no need for precomputed constants
928 ###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
930 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
932 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
934 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
936 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
938 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
940 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
942 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
944 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
946 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
948 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
950 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
952 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
954 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
956 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
958 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
960 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
962 vmovdqa (arg1), \T_key
963 vpxor \T_key, \XMM1, \XMM1
964 vpxor \T_key, \XMM2, \XMM2
965 vpxor \T_key, \XMM3, \XMM3
966 vpxor \T_key, \XMM4, \XMM4
967 vpxor \T_key, \XMM5, \XMM5
968 vpxor \T_key, \XMM6, \XMM6
969 vpxor \T_key, \XMM7, \XMM7
970 vpxor \T_key, \XMM8, \XMM8
974 .rep \REP # do REP rounds
975 vmovdqa 16*i(arg1), \T_key
976 vaesenc \T_key, \XMM1, \XMM1
977 vaesenc \T_key, \XMM2, \XMM2
978 vaesenc \T_key, \XMM3, \XMM3
979 vaesenc \T_key, \XMM4, \XMM4
980 vaesenc \T_key, \XMM5, \XMM5
981 vaesenc \T_key, \XMM6, \XMM6
982 vaesenc \T_key, \XMM7, \XMM7
983 vaesenc \T_key, \XMM8, \XMM8
988 vmovdqa 16*i(arg1), \T_key
989 vaesenclast \T_key, \XMM1, \XMM1
990 vaesenclast \T_key, \XMM2, \XMM2
991 vaesenclast \T_key, \XMM3, \XMM3
992 vaesenclast \T_key, \XMM4, \XMM4
993 vaesenclast \T_key, \XMM5, \XMM5
994 vaesenclast \T_key, \XMM6, \XMM6
995 vaesenclast \T_key, \XMM7, \XMM7
996 vaesenclast \T_key, \XMM8, \XMM8
998 vmovdqu (arg4, %r11), \T1
999 vpxor \T1, \XMM1, \XMM1
1000 vmovdqu \XMM1, (arg3 , %r11)
1005 vmovdqu 16*1(arg4, %r11), \T1
1006 vpxor \T1, \XMM2, \XMM2
1007 vmovdqu \XMM2, 16*1(arg3 , %r11)
1012 vmovdqu 16*2(arg4, %r11), \T1
1013 vpxor \T1, \XMM3, \XMM3
1014 vmovdqu \XMM3, 16*2(arg3 , %r11)
1019 vmovdqu 16*3(arg4, %r11), \T1
1020 vpxor \T1, \XMM4, \XMM4
1021 vmovdqu \XMM4, 16*3(arg3 , %r11)
1026 vmovdqu 16*4(arg4, %r11), \T1
1027 vpxor \T1, \XMM5, \XMM5
1028 vmovdqu \XMM5, 16*4(arg3 , %r11)
1033 vmovdqu 16*5(arg4, %r11), \T1
1034 vpxor \T1, \XMM6, \XMM6
1035 vmovdqu \XMM6, 16*5(arg3 , %r11)
1040 vmovdqu 16*6(arg4, %r11), \T1
1041 vpxor \T1, \XMM7, \XMM7
1042 vmovdqu \XMM7, 16*6(arg3 , %r11)
1047 vmovdqu 16*7(arg4, %r11), \T1
1048 vpxor \T1, \XMM8, \XMM8
1049 vmovdqu \XMM8, 16*7(arg3 , %r11)
1056 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1057 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1058 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1059 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1060 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1061 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1062 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1063 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1064 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1066 ###############################################################################
1068 _initial_blocks_done\@:
1072 # encrypt 8 blocks at a time
1073 # ghash the 8 previously encrypted ciphertext blocks
1074 # arg1, arg3, arg4 are used as pointers only, not modified
1075 # r11 is the data offset value
1076 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1079 vmovdqa \XMM2, TMP2(%rsp)
1080 vmovdqa \XMM3, TMP3(%rsp)
1081 vmovdqa \XMM4, TMP4(%rsp)
1082 vmovdqa \XMM5, TMP5(%rsp)
1083 vmovdqa \XMM6, TMP6(%rsp)
1084 vmovdqa \XMM7, TMP7(%rsp)
1085 vmovdqa \XMM8, TMP8(%rsp)
1087 .if \loop_idx == in_order
1088 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1089 vpaddd ONE(%rip), \XMM1, \XMM2
1090 vpaddd ONE(%rip), \XMM2, \XMM3
1091 vpaddd ONE(%rip), \XMM3, \XMM4
1092 vpaddd ONE(%rip), \XMM4, \XMM5
1093 vpaddd ONE(%rip), \XMM5, \XMM6
1094 vpaddd ONE(%rip), \XMM6, \XMM7
1095 vpaddd ONE(%rip), \XMM7, \XMM8
1098 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1099 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1100 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1101 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1102 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1103 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1104 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1105 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1107 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1108 vpaddd ONEf(%rip), \XMM1, \XMM2
1109 vpaddd ONEf(%rip), \XMM2, \XMM3
1110 vpaddd ONEf(%rip), \XMM3, \XMM4
1111 vpaddd ONEf(%rip), \XMM4, \XMM5
1112 vpaddd ONEf(%rip), \XMM5, \XMM6
1113 vpaddd ONEf(%rip), \XMM6, \XMM7
1114 vpaddd ONEf(%rip), \XMM7, \XMM8
1119 #######################################################################
1122 vpxor \T1, \XMM1, \XMM1
1123 vpxor \T1, \XMM2, \XMM2
1124 vpxor \T1, \XMM3, \XMM3
1125 vpxor \T1, \XMM4, \XMM4
1126 vpxor \T1, \XMM5, \XMM5
1127 vpxor \T1, \XMM6, \XMM6
1128 vpxor \T1, \XMM7, \XMM7
1129 vpxor \T1, \XMM8, \XMM8
1131 #######################################################################
1137 vmovdqu 16*1(arg1), \T1
1138 vaesenc \T1, \XMM1, \XMM1
1139 vaesenc \T1, \XMM2, \XMM2
1140 vaesenc \T1, \XMM3, \XMM3
1141 vaesenc \T1, \XMM4, \XMM4
1142 vaesenc \T1, \XMM5, \XMM5
1143 vaesenc \T1, \XMM6, \XMM6
1144 vaesenc \T1, \XMM7, \XMM7
1145 vaesenc \T1, \XMM8, \XMM8
1147 vmovdqu 16*2(arg1), \T1
1148 vaesenc \T1, \XMM1, \XMM1
1149 vaesenc \T1, \XMM2, \XMM2
1150 vaesenc \T1, \XMM3, \XMM3
1151 vaesenc \T1, \XMM4, \XMM4
1152 vaesenc \T1, \XMM5, \XMM5
1153 vaesenc \T1, \XMM6, \XMM6
1154 vaesenc \T1, \XMM7, \XMM7
1155 vaesenc \T1, \XMM8, \XMM8
1158 #######################################################################
1160 vmovdqu HashKey_8(arg2), \T5
1161 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1162 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1164 vpshufd $0b01001110, \T2, \T6
1167 vmovdqu HashKey_8_k(arg2), \T5
1168 vpclmulqdq $0x00, \T5, \T6, \T6
1170 vmovdqu 16*3(arg1), \T1
1171 vaesenc \T1, \XMM1, \XMM1
1172 vaesenc \T1, \XMM2, \XMM2
1173 vaesenc \T1, \XMM3, \XMM3
1174 vaesenc \T1, \XMM4, \XMM4
1175 vaesenc \T1, \XMM5, \XMM5
1176 vaesenc \T1, \XMM6, \XMM6
1177 vaesenc \T1, \XMM7, \XMM7
1178 vaesenc \T1, \XMM8, \XMM8
1180 vmovdqa TMP2(%rsp), \T1
1181 vmovdqu HashKey_7(arg2), \T5
1182 vpclmulqdq $0x11, \T5, \T1, \T3
1184 vpclmulqdq $0x00, \T5, \T1, \T3
1187 vpshufd $0b01001110, \T1, \T3
1189 vmovdqu HashKey_7_k(arg2), \T5
1190 vpclmulqdq $0x10, \T5, \T3, \T3
1193 vmovdqu 16*4(arg1), \T1
1194 vaesenc \T1, \XMM1, \XMM1
1195 vaesenc \T1, \XMM2, \XMM2
1196 vaesenc \T1, \XMM3, \XMM3
1197 vaesenc \T1, \XMM4, \XMM4
1198 vaesenc \T1, \XMM5, \XMM5
1199 vaesenc \T1, \XMM6, \XMM6
1200 vaesenc \T1, \XMM7, \XMM7
1201 vaesenc \T1, \XMM8, \XMM8
1203 #######################################################################
1205 vmovdqa TMP3(%rsp), \T1
1206 vmovdqu HashKey_6(arg2), \T5
1207 vpclmulqdq $0x11, \T5, \T1, \T3
1209 vpclmulqdq $0x00, \T5, \T1, \T3
1212 vpshufd $0b01001110, \T1, \T3
1214 vmovdqu HashKey_6_k(arg2), \T5
1215 vpclmulqdq $0x10, \T5, \T3, \T3
1218 vmovdqu 16*5(arg1), \T1
1219 vaesenc \T1, \XMM1, \XMM1
1220 vaesenc \T1, \XMM2, \XMM2
1221 vaesenc \T1, \XMM3, \XMM3
1222 vaesenc \T1, \XMM4, \XMM4
1223 vaesenc \T1, \XMM5, \XMM5
1224 vaesenc \T1, \XMM6, \XMM6
1225 vaesenc \T1, \XMM7, \XMM7
1226 vaesenc \T1, \XMM8, \XMM8
1228 vmovdqa TMP4(%rsp), \T1
1229 vmovdqu HashKey_5(arg2), \T5
1230 vpclmulqdq $0x11, \T5, \T1, \T3
1232 vpclmulqdq $0x00, \T5, \T1, \T3
1235 vpshufd $0b01001110, \T1, \T3
1237 vmovdqu HashKey_5_k(arg2), \T5
1238 vpclmulqdq $0x10, \T5, \T3, \T3
1241 vmovdqu 16*6(arg1), \T1
1242 vaesenc \T1, \XMM1, \XMM1
1243 vaesenc \T1, \XMM2, \XMM2
1244 vaesenc \T1, \XMM3, \XMM3
1245 vaesenc \T1, \XMM4, \XMM4
1246 vaesenc \T1, \XMM5, \XMM5
1247 vaesenc \T1, \XMM6, \XMM6
1248 vaesenc \T1, \XMM7, \XMM7
1249 vaesenc \T1, \XMM8, \XMM8
1252 vmovdqa TMP5(%rsp), \T1
1253 vmovdqu HashKey_4(arg2), \T5
1254 vpclmulqdq $0x11, \T5, \T1, \T3
1256 vpclmulqdq $0x00, \T5, \T1, \T3
1259 vpshufd $0b01001110, \T1, \T3
1261 vmovdqu HashKey_4_k(arg2), \T5
1262 vpclmulqdq $0x10, \T5, \T3, \T3
1265 vmovdqu 16*7(arg1), \T1
1266 vaesenc \T1, \XMM1, \XMM1
1267 vaesenc \T1, \XMM2, \XMM2
1268 vaesenc \T1, \XMM3, \XMM3
1269 vaesenc \T1, \XMM4, \XMM4
1270 vaesenc \T1, \XMM5, \XMM5
1271 vaesenc \T1, \XMM6, \XMM6
1272 vaesenc \T1, \XMM7, \XMM7
1273 vaesenc \T1, \XMM8, \XMM8
1275 vmovdqa TMP6(%rsp), \T1
1276 vmovdqu HashKey_3(arg2), \T5
1277 vpclmulqdq $0x11, \T5, \T1, \T3
1279 vpclmulqdq $0x00, \T5, \T1, \T3
1282 vpshufd $0b01001110, \T1, \T3
1284 vmovdqu HashKey_3_k(arg2), \T5
1285 vpclmulqdq $0x10, \T5, \T3, \T3
1289 vmovdqu 16*8(arg1), \T1
1290 vaesenc \T1, \XMM1, \XMM1
1291 vaesenc \T1, \XMM2, \XMM2
1292 vaesenc \T1, \XMM3, \XMM3
1293 vaesenc \T1, \XMM4, \XMM4
1294 vaesenc \T1, \XMM5, \XMM5
1295 vaesenc \T1, \XMM6, \XMM6
1296 vaesenc \T1, \XMM7, \XMM7
1297 vaesenc \T1, \XMM8, \XMM8
1299 vmovdqa TMP7(%rsp), \T1
1300 vmovdqu HashKey_2(arg2), \T5
1301 vpclmulqdq $0x11, \T5, \T1, \T3
1303 vpclmulqdq $0x00, \T5, \T1, \T3
1306 vpshufd $0b01001110, \T1, \T3
1308 vmovdqu HashKey_2_k(arg2), \T5
1309 vpclmulqdq $0x10, \T5, \T3, \T3
1312 #######################################################################
1314 vmovdqu 16*9(arg1), \T5
1315 vaesenc \T5, \XMM1, \XMM1
1316 vaesenc \T5, \XMM2, \XMM2
1317 vaesenc \T5, \XMM3, \XMM3
1318 vaesenc \T5, \XMM4, \XMM4
1319 vaesenc \T5, \XMM5, \XMM5
1320 vaesenc \T5, \XMM6, \XMM6
1321 vaesenc \T5, \XMM7, \XMM7
1322 vaesenc \T5, \XMM8, \XMM8
1324 vmovdqa TMP8(%rsp), \T1
1325 vmovdqu HashKey(arg2), \T5
1326 vpclmulqdq $0x11, \T5, \T1, \T3
1328 vpclmulqdq $0x00, \T5, \T1, \T3
1331 vpshufd $0b01001110, \T1, \T3
1333 vmovdqu HashKey_k(arg2), \T5
1334 vpclmulqdq $0x10, \T5, \T3, \T3
1340 vmovdqu 16*10(arg1), \T5
1346 vaesenc \T5, \XMM1, \XMM1
1347 vaesenc \T5, \XMM2, \XMM2
1348 vaesenc \T5, \XMM3, \XMM3
1349 vaesenc \T5, \XMM4, \XMM4
1350 vaesenc \T5, \XMM5, \XMM5
1351 vaesenc \T5, \XMM6, \XMM6
1352 vaesenc \T5, \XMM7, \XMM7
1353 vaesenc \T5, \XMM8, \XMM8
1355 vmovdqu 16*i(arg1), \T5
1364 vpxor 16*i(arg4, %r11), \T5, \T2
1366 vaesenclast \T2, reg_j, reg_j
1368 vaesenclast \T2, reg_j, \T3
1369 vmovdqu 16*i(arg4, %r11), reg_j
1370 vmovdqu \T3, 16*i(arg3, %r11)
1376 #######################################################################
1379 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1382 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1386 #######################################################################
1387 #first phase of the reduction
1388 #######################################################################
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
1393 vpxor \T3, \T2, \T2 # xor the shifted versions
1396 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1398 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1399 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1400 #######################################################################
1402 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1403 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1404 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1405 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1406 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1407 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1408 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1409 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1412 #######################################################################
1413 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
1417 vpxor \T3, \T2, \T2 # xor the shifted versions
1422 vpxor \T7, \T6, \T6 # the result is in T6
1423 #######################################################################
1425 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1426 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1427 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1428 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1429 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1430 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1431 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1432 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1435 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
1443 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1448 vpshufd $0b01001110, \XMM1, \T2
1449 vpxor \XMM1, \T2, \T2
1450 vmovdqu HashKey_8(arg2), \T5
1451 vpclmulqdq $0x11, \T5, \XMM1, \T6
1452 vpclmulqdq $0x00, \T5, \XMM1, \T7
1454 vmovdqu HashKey_8_k(arg2), \T3
1455 vpclmulqdq $0x00, \T3, \T2, \XMM1
1457 ######################
1459 vpshufd $0b01001110, \XMM2, \T2
1460 vpxor \XMM2, \T2, \T2
1461 vmovdqu HashKey_7(arg2), \T5
1462 vpclmulqdq $0x11, \T5, \XMM2, \T4
1465 vpclmulqdq $0x00, \T5, \XMM2, \T4
1468 vmovdqu HashKey_7_k(arg2), \T3
1469 vpclmulqdq $0x00, \T3, \T2, \T2
1470 vpxor \T2, \XMM1, \XMM1
1472 ######################
1474 vpshufd $0b01001110, \XMM3, \T2
1475 vpxor \XMM3, \T2, \T2
1476 vmovdqu HashKey_6(arg2), \T5
1477 vpclmulqdq $0x11, \T5, \XMM3, \T4
1480 vpclmulqdq $0x00, \T5, \XMM3, \T4
1483 vmovdqu HashKey_6_k(arg2), \T3
1484 vpclmulqdq $0x00, \T3, \T2, \T2
1485 vpxor \T2, \XMM1, \XMM1
1487 ######################
1489 vpshufd $0b01001110, \XMM4, \T2
1490 vpxor \XMM4, \T2, \T2
1491 vmovdqu HashKey_5(arg2), \T5
1492 vpclmulqdq $0x11, \T5, \XMM4, \T4
1495 vpclmulqdq $0x00, \T5, \XMM4, \T4
1498 vmovdqu HashKey_5_k(arg2), \T3
1499 vpclmulqdq $0x00, \T3, \T2, \T2
1500 vpxor \T2, \XMM1, \XMM1
1502 ######################
1504 vpshufd $0b01001110, \XMM5, \T2
1505 vpxor \XMM5, \T2, \T2
1506 vmovdqu HashKey_4(arg2), \T5
1507 vpclmulqdq $0x11, \T5, \XMM5, \T4
1510 vpclmulqdq $0x00, \T5, \XMM5, \T4
1513 vmovdqu HashKey_4_k(arg2), \T3
1514 vpclmulqdq $0x00, \T3, \T2, \T2
1515 vpxor \T2, \XMM1, \XMM1
1517 ######################
1519 vpshufd $0b01001110, \XMM6, \T2
1520 vpxor \XMM6, \T2, \T2
1521 vmovdqu HashKey_3(arg2), \T5
1522 vpclmulqdq $0x11, \T5, \XMM6, \T4
1525 vpclmulqdq $0x00, \T5, \XMM6, \T4
1528 vmovdqu HashKey_3_k(arg2), \T3
1529 vpclmulqdq $0x00, \T3, \T2, \T2
1530 vpxor \T2, \XMM1, \XMM1
1532 ######################
1534 vpshufd $0b01001110, \XMM7, \T2
1535 vpxor \XMM7, \T2, \T2
1536 vmovdqu HashKey_2(arg2), \T5
1537 vpclmulqdq $0x11, \T5, \XMM7, \T4
1540 vpclmulqdq $0x00, \T5, \XMM7, \T4
1543 vmovdqu HashKey_2_k(arg2), \T3
1544 vpclmulqdq $0x00, \T3, \T2, \T2
1545 vpxor \T2, \XMM1, \XMM1
1547 ######################
1549 vpshufd $0b01001110, \XMM8, \T2
1550 vpxor \XMM8, \T2, \T2
1551 vmovdqu HashKey(arg2), \T5
1552 vpclmulqdq $0x11, \T5, \XMM8, \T4
1555 vpclmulqdq $0x00, \T5, \XMM8, \T4
1558 vmovdqu HashKey_k(arg2), \T3
1559 vpclmulqdq $0x00, \T3, \T2, \T2
1561 vpxor \T2, \XMM1, \XMM1
1562 vpxor \T6, \XMM1, \XMM1
1563 vpxor \T7, \XMM1, \T2
1568 vpslldq $8, \T2, \T4
1569 vpsrldq $8, \T2, \T2
1572 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1573 # the accumulated carry-less multiplications
1575 #######################################################################
1576 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
1581 vpxor \T3, \T2, \T2 # xor the shifted versions
1584 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1586 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1587 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1588 #######################################################################
1591 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
1595 vpxor \T3, \T2, \T2 # xor the shifted versions
1600 vpxor \T7, \T6, \T6 # the result is in T6
1604 #############################################################
1605 #void aesni_gcm_precomp_avx_gen2
1606 # (gcm_data *my_ctx_data,
1607 # gcm_context_data *data,
# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1609 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1610 # (from Security Association) concatenated with 8 byte
1611 # Initialisation Vector (from IPSec ESP Payload)
1612 # concatenated with 0x00000001. 16-byte aligned pointer. */
1613 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1614 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1615 #############################################################
1616 ENTRY(aesni_gcm_precomp_avx_gen2)
1618 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1621 ENDPROC(aesni_gcm_precomp_avx_gen2)
1623 ###############################################################################
1624 #void aesni_gcm_enc_avx_gen2(
1625 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1626 # gcm_context_data *data,
1627 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1628 # const u8 *in, /* Plaintext input */
1629 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1630 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1631 # (from Security Association) concatenated with 8 byte
1632 # Initialisation Vector (from IPSec ESP Payload)
1633 # concatenated with 0x00000001. 16-byte aligned pointer. */
1634 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1635 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1636 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1638 # Valid values are 16 (most likely), 12 or 8. */
1639 ###############################################################################
1640 ENTRY(aesni_gcm_enc_avx_gen2)
1648 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1652 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1656 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1659 ENDPROC(aesni_gcm_enc_avx_gen2)
1661 ###############################################################################
1662 #void aesni_gcm_dec_avx_gen2(
1663 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1664 # gcm_context_data *data,
1665 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1666 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for decryption. */
1668 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1669 # (from Security Association) concatenated with 8 byte
1670 # Initialisation Vector (from IPSec ESP Payload)
1671 # concatenated with 0x00000001. 16-byte aligned pointer. */
1672 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1673 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1674 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1676 # Valid values are 16 (most likely), 12 or 8. */
1677 ###############################################################################
1678 ENTRY(aesni_gcm_dec_avx_gen2)
1686 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1690 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1694 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1697 ENDPROC(aesni_gcm_dec_avx_gen2)
1698 #endif /* CONFIG_AS_AVX */
1700 #ifdef CONFIG_AS_AVX2
1701 ###############################################################################
1702 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1703 # Input: A and B (128-bits each, bit-reflected)
1704 # Output: C = A*B*x mod poly, (i.e. >>1 )
1705 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1706 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1707 ###############################################################################
1708 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1710 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1711 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1712 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1713 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1717 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1718 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1723 #######################################################################
1724 #first phase of the reduction
1725 vmovdqa POLY2(%rip), \T3
1727 vpclmulqdq $0x01, \GH, \T3, \T2
1728 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1730 vpxor \T2, \GH, \GH # first phase of the reduction complete
1731 #######################################################################
1732 #second phase of the reduction
1733 vpclmulqdq $0x00, \GH, \T3, \T2
1734 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1736 vpclmulqdq $0x10, \GH, \T3, \GH
1737 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1739 vpxor \T2, \GH, \GH # second phase of the reduction complete
1740 #######################################################################
1741 vpxor \T1, \GH, \GH # the result is in GH
1746 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1750 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1751 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1753 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1754 vmovdqu \T5, HashKey_3(arg2)
1756 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1757 vmovdqu \T5, HashKey_4(arg2)
1759 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1760 vmovdqu \T5, HashKey_5(arg2)
1762 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1763 vmovdqu \T5, HashKey_6(arg2)
1765 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1766 vmovdqu \T5, HashKey_7(arg2)
1768 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1769 vmovdqu \T5, HashKey_8(arg2)
## if a = number of total plaintext bytes
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1780 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1781 i = (8-\num_initial_blocks)
1783 vmovdqu AadHash(arg2), reg_i
1785 # initialize the data pointer offset as zero
1788 # start AES for num_initial_blocks blocks
1789 vmovdqu CurCount(arg2), \CTR
1791 i = (9-\num_initial_blocks)
1793 .rep \num_initial_blocks
1794 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1796 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1801 vmovdqa (arg1), \T_key
1802 i = (9-\num_initial_blocks)
1804 .rep \num_initial_blocks
1805 vpxor \T_key, reg_i, reg_i
1813 vmovdqa 16*j(arg1), \T_key
1814 i = (9-\num_initial_blocks)
1816 .rep \num_initial_blocks
1817 vaesenc \T_key, reg_i, reg_i
1827 vmovdqa 16*j(arg1), \T_key
1828 i = (9-\num_initial_blocks)
1830 .rep \num_initial_blocks
1831 vaesenclast \T_key, reg_i, reg_i
1836 i = (9-\num_initial_blocks)
1838 .rep \num_initial_blocks
1839 vmovdqu (arg4, %r11), \T1
1840 vpxor \T1, reg_i, reg_i
1841 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1842 # num_initial_blocks blocks
1847 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1853 i = (8-\num_initial_blocks)
1854 j = (9-\num_initial_blocks)
1857 .rep \num_initial_blocks
1858 vpxor reg_i, reg_j, reg_j
1859 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1864 # XMM8 has the combined result here
1866 vmovdqa \XMM8, TMP1(%rsp)
1870 jl _initial_blocks_done\@ # no need for precomputed constants
1872 ###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1874 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1876 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1878 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1880 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1882 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1884 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1886 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1888 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1890 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1892 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1894 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1896 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1898 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1900 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1902 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1904 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1906 vmovdqa (arg1), \T_key
1907 vpxor \T_key, \XMM1, \XMM1
1908 vpxor \T_key, \XMM2, \XMM2
1909 vpxor \T_key, \XMM3, \XMM3
1910 vpxor \T_key, \XMM4, \XMM4
1911 vpxor \T_key, \XMM5, \XMM5
1912 vpxor \T_key, \XMM6, \XMM6
1913 vpxor \T_key, \XMM7, \XMM7
1914 vpxor \T_key, \XMM8, \XMM8
1918 .rep \REP # do REP rounds
1919 vmovdqa 16*i(arg1), \T_key
1920 vaesenc \T_key, \XMM1, \XMM1
1921 vaesenc \T_key, \XMM2, \XMM2
1922 vaesenc \T_key, \XMM3, \XMM3
1923 vaesenc \T_key, \XMM4, \XMM4
1924 vaesenc \T_key, \XMM5, \XMM5
1925 vaesenc \T_key, \XMM6, \XMM6
1926 vaesenc \T_key, \XMM7, \XMM7
1927 vaesenc \T_key, \XMM8, \XMM8
1933 vmovdqa 16*i(arg1), \T_key
1934 vaesenclast \T_key, \XMM1, \XMM1
1935 vaesenclast \T_key, \XMM2, \XMM2
1936 vaesenclast \T_key, \XMM3, \XMM3
1937 vaesenclast \T_key, \XMM4, \XMM4
1938 vaesenclast \T_key, \XMM5, \XMM5
1939 vaesenclast \T_key, \XMM6, \XMM6
1940 vaesenclast \T_key, \XMM7, \XMM7
1941 vaesenclast \T_key, \XMM8, \XMM8
1943 vmovdqu (arg4, %r11), \T1
1944 vpxor \T1, \XMM1, \XMM1
1945 vmovdqu \XMM1, (arg3 , %r11)
1950 vmovdqu 16*1(arg4, %r11), \T1
1951 vpxor \T1, \XMM2, \XMM2
1952 vmovdqu \XMM2, 16*1(arg3 , %r11)
1957 vmovdqu 16*2(arg4, %r11), \T1
1958 vpxor \T1, \XMM3, \XMM3
1959 vmovdqu \XMM3, 16*2(arg3 , %r11)
1964 vmovdqu 16*3(arg4, %r11), \T1
1965 vpxor \T1, \XMM4, \XMM4
1966 vmovdqu \XMM4, 16*3(arg3 , %r11)
1971 vmovdqu 16*4(arg4, %r11), \T1
1972 vpxor \T1, \XMM5, \XMM5
1973 vmovdqu \XMM5, 16*4(arg3 , %r11)
1978 vmovdqu 16*5(arg4, %r11), \T1
1979 vpxor \T1, \XMM6, \XMM6
1980 vmovdqu \XMM6, 16*5(arg3 , %r11)
1985 vmovdqu 16*6(arg4, %r11), \T1
1986 vpxor \T1, \XMM7, \XMM7
1987 vmovdqu \XMM7, 16*6(arg3 , %r11)
1992 vmovdqu 16*7(arg4, %r11), \T1
1993 vpxor \T1, \XMM8, \XMM8
1994 vmovdqu \XMM8, 16*7(arg3 , %r11)
2001 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2002 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2003 # the corresponding ciphertext
2004 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2005 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2006 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2007 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2008 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2009 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2010 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2012 ###############################################################################
2014 _initial_blocks_done\@:
2021 # encrypt 8 blocks at a time
2022 # ghash the 8 previously encrypted ciphertext blocks
2023 # arg1, arg3, arg4 are used as pointers only, not modified
2024 # r11 is the data offset value
2025 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2028 vmovdqa \XMM2, TMP2(%rsp)
2029 vmovdqa \XMM3, TMP3(%rsp)
2030 vmovdqa \XMM4, TMP4(%rsp)
2031 vmovdqa \XMM5, TMP5(%rsp)
2032 vmovdqa \XMM6, TMP6(%rsp)
2033 vmovdqa \XMM7, TMP7(%rsp)
2034 vmovdqa \XMM8, TMP8(%rsp)
2036 .if \loop_idx == in_order
2037 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2038 vpaddd ONE(%rip), \XMM1, \XMM2
2039 vpaddd ONE(%rip), \XMM2, \XMM3
2040 vpaddd ONE(%rip), \XMM3, \XMM4
2041 vpaddd ONE(%rip), \XMM4, \XMM5
2042 vpaddd ONE(%rip), \XMM5, \XMM6
2043 vpaddd ONE(%rip), \XMM6, \XMM7
2044 vpaddd ONE(%rip), \XMM7, \XMM8
2047 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2048 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2049 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2050 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2051 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2052 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2053 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2054 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2056 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2057 vpaddd ONEf(%rip), \XMM1, \XMM2
2058 vpaddd ONEf(%rip), \XMM2, \XMM3
2059 vpaddd ONEf(%rip), \XMM3, \XMM4
2060 vpaddd ONEf(%rip), \XMM4, \XMM5
2061 vpaddd ONEf(%rip), \XMM5, \XMM6
2062 vpaddd ONEf(%rip), \XMM6, \XMM7
2063 vpaddd ONEf(%rip), \XMM7, \XMM8
2068 #######################################################################
2071 vpxor \T1, \XMM1, \XMM1
2072 vpxor \T1, \XMM2, \XMM2
2073 vpxor \T1, \XMM3, \XMM3
2074 vpxor \T1, \XMM4, \XMM4
2075 vpxor \T1, \XMM5, \XMM5
2076 vpxor \T1, \XMM6, \XMM6
2077 vpxor \T1, \XMM7, \XMM7
2078 vpxor \T1, \XMM8, \XMM8
2080 #######################################################################
        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq $0x11, \T5, \T2, \T4         # T4 = a1*b1
        vpclmulqdq $0x00, \T5, \T2, \T7         # T7 = a0*b0
        vpclmulqdq $0x01, \T5, \T2, \T6         # T6 = a1*b0
        vpclmulqdq $0x10, \T5, \T2, \T5         # T5 = a0*b1

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq $0x11, \T5, \T1, \T3
        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq $0x00, \T5, \T1, \T3
        vpclmulqdq $0x01, \T5, \T1, \T3
        vpclmulqdq $0x10, \T5, \T1, \T3
        vpclmulqdq $0x11, \T5, \T1, \T3

        vmovdqu 16*10(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5

        vpxor   16*i(arg4, %r11), \T5, \T2
        vaesenclast \T2, reg_j, reg_j
        vaesenclast \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
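        # Final round: \T2 is the last round key XORed with the input block,
        # so vaesenclast finishes the CTR keystream and the data XOR in one
        # step.  In the ENC path the result (reg_j) is the ciphertext and
        # also feeds the next iteration's GHASH; in the DEC path the result
        # (\T3) is the plaintext written to arg3, while the ciphertext read
        # from arg4 is kept in reg_j as the GHASH input.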
        #######################################################################

        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                            # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
        #######################################################################

        vmovdqu \XMM1, 16*0(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)                  # Write to the Ciphertext buffer

        #######################################################################
        # second phase of the reduction
        vpclmulqdq $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                            # shift-R T2 1 DW (shift-R only 1 DW to obtain a 2-DW shift-R)

        vpclmulqdq $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                            # shift-L T4 1 DW (shift-L 1 DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                           # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                           # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1


# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
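        # Each of the eight ciphertext blocks is multiplied by the matching
        # power of the hash key (XMM1 by HashKey_8 ... XMM8 by HashKey) and
        # the partial products are XORed together.  Karatsuba is used for
        # each 128x128-bit carry-less multiply: vpshufd/vpxor form (a1^a0)
        # and (b1^b0), so three vpclmulqdq per block yield a1*b1 (high),
        # a0*b0 (low) and the middle term, which is recombined below before
        # the reduction.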
        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM1, \T6
        vpclmulqdq $0x00, \T5, \XMM1, \T7

        vpclmulqdq $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM2, \T4
        vpclmulqdq $0x00, \T5, \XMM2, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM3, \T4
        vpclmulqdq $0x00, \T5, \XMM3, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM4, \T4
        vpclmulqdq $0x00, \T5, \XMM4, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM5, \T4
        vpclmulqdq $0x00, \T5, \XMM5, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM6, \T4
        vpclmulqdq $0x00, \T5, \XMM6, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM7, \T4
        vpclmulqdq $0x00, \T5, \XMM7, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM8, \T4
        vpclmulqdq $0x00, \T5, \XMM8, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T2, \T6, \T6                           # <T6:T7> holds the result of the
                                                        # accumulated carry-less multiplications

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                            # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
        #######################################################################

        # second phase of the reduction
        vpclmulqdq $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                            # shift-R T2 1 DW (shift-R only 1 DW to obtain a 2-DW shift-R)

        vpclmulqdq $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                            # shift-L T4 1 DW (shift-L 1 DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                           # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                           # the result is in T6
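        # Note on the reduction above: GHASH multiplies in GF(2^128) modulo
        # g(x) = x^128 + x^7 + x^2 + x + 1, with the operands kept in
        # bit-reflected form.  The carry-less multiplications leave a 256-bit
        # product in <T6:T7>; the vpclmulqdq operations against the POLY2
        # constant fold the T7 half into the final 128-bit result in two
        # steps, so the modular reduction costs only a few multiplies, shifts
        # and XORs instead of a long division by g(x).  The same two-phase
        # reduction is used in GHASH_8_ENCRYPT_8_PARALLEL_AVX2 above.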
#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         u8     *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#         const  u8 *aad, /* Additional Authentication Data (AAD) */
#         u64    aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
ENDPROC(aesni_gcm_precomp_avx_gen4)
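# Illustrative C-level call (a sketch only: the variable names and the glue
# code around it are not part of this file, and the hash subkey is assumed to
# have been derived by the caller beforehand, in GCM typically as
# H = AES-K(0^128), in a buffer meeting the alignment requirement above):
#
#       struct gcm_context_data data;
#
#       aesni_gcm_precomp_avx_gen4(aes_ctx, &data, hash_subkey, iv, aad, aad_len);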
###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 # 192-bit key
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9  # 128-bit key
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 # 256-bit key
ENDPROC(aesni_gcm_enc_avx_gen4)
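# Illustrative C-level call (a sketch with made-up variable names, not the
# in-kernel glue code), once aesni_gcm_precomp_avx_gen4() above has populated
# 'data':
#
#       u8 tag[16];
#
#       aesni_gcm_enc_avx_gen4(aes_ctx, &data, ciphertext, plaintext,
#                              plaintext_len, iv, aad, aad_len, tag, 16);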
###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 # 192-bit key
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9  # 128-bit key
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 # 256-bit key
ENDPROC(aesni_gcm_dec_avx_gen4)
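# Illustrative C-level call (again only a sketch with made-up names; the
# routine computes the tag, and the constant-time comparison against the
# received tag is the caller's job):
#
#       u8 calc_tag[16];
#
#       aesni_gcm_dec_avx_gen4(aes_ctx, &data, plaintext, ciphertext,
#                              ciphertext_len, iv, aad, aad_len, calc_tag, 16);
#       if (crypto_memneq(calc_tag, received_tag, 16))
#               return -EBADMSG;        /* authentication failed */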
#endif /* CONFIG_AS_AVX2 */