1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived from, and highly optimized relative to, the code described in the paper:
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
59 ## 0                   1                   2                   3
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## |                      Salt  (From the SA)                      |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## |                     Initialization Vector                     |
65 ## |        (This is the sequence number from IPSec header)        |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ## |                              0x1                              |
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
76 ## if AAD is 8 bytes:
77 ## AAD[2] = {A0, A1};
78 ## padded AAD in xmm register = {A1 A0 0 0}
80 ## 0                   1                   2                   3
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ## |                           SPI (A1)                            |
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## |                  32-bit Sequence Number (A0)                  |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ## |                      32-bit Zero-padding                      |
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
96 ## 0                   1                   2                   3
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ## |                           SPI (A2)                            |
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## |            64-bit Extended Sequence Number {A1,A0}            |
102 ## |                                                               |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ## |                      32-bit Zero-padding                      |
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports an aadLen of 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentations are used. one tab
119 ## is for the GHASH part, two tabs are for the AES part.
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
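# Note on the two increment constants: the in_order path keeps the counter
# byte-reflected (after a SHUF_MASK shuffle), so ONE adds 1 to its low dword;
# the out_order path keeps the counter in AES input byte order, where the
# big-endian counter's least significant byte is the top byte of the register,
# so ONEf adds 1 there via the top dword. Since vpaddd cannot carry out of
# that byte, the enc/dec loop is expected to take the in_order path whenever
# the low counter byte may wrap.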
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
162 aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
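# each 16-byte entry above is a vpshufb mask (0xff lanes produce zero); the
# AAD tail handling below indexes this table by the number of leftover AAD
# bytes so the bytes gathered by its 4B/8B reads end up left-aligned with
# the over-read bytes shifted out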
185 HashKey = 16*6 # store HashKey <<1 mod poly here
186 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
187 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
188 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
189 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
190 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
191 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
192 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
193 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
194 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
195 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
196 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
197 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
198 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
199 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
200 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
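# the _k entries cache high64(HashKey^i) XOR low64(HashKey^i), the value the
# Karatsuba multiply needs. with a = a1:a0 and b = b1:b0 (64-bit halves,
# '+' meaning XOR):
#
#     a*b = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
#     a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
#
# precomputing (b1+b0) per hash key power saves one XOR for every multiply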
208 #define arg7 STACK_OFFSET+8*1(%r14)
209 #define arg8 STACK_OFFSET+8*2(%r14)
210 #define arg9 STACK_OFFSET+8*3(%r14)
211 #define arg10 STACK_OFFSET+8*4(%r14)
221 .macro define_reg r n
232 # need to push 4 registers onto the stack to maintain
235 TMP1 = 16*0 # Temporary storage for AAD
236 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
237 TMP3 = 16*2 # Temporary storage for AES State 3
238 TMP4 = 16*3 # Temporary storage for AES State 4
239 TMP5 = 16*4 # Temporary storage for AES State 5
240 TMP6 = 16*5 # Temporary storage for AES State 6
241 TMP7 = 16*6 # Temporary storage for AES State 7
242 TMP8 = 16*7 # Temporary storage for AES State 8
244 VARIABLE_OFFSET = 16*8
246 ################################
248 ################################
251 #the number of pushes must equal STACK_OFFSET
261 sub $VARIABLE_OFFSET, %rsp
262 and $~63, %rsp # align rsp to 64 bytes
274 # Encryption of a single block
275 .macro ENCRYPT_SINGLE_BLOCK XMM0
276 vpxor (arg1), \XMM0, \XMM0
280 vaesenc 16*i(arg1), \XMM0, \XMM0
284 vaesenclast 16*10(arg1), \XMM0, \XMM0
287 # combined for GCM encrypt and decrypt functions
288 # clobbering all xmm registers
289 # clobbering r10, r11, r12, r13, r14, r15
290 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
291 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
293 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
294 and $-16, %r13 # r13 = r13 - (r13 mod 16)
299 jz _initial_num_blocks_is_0\@
302 je _initial_num_blocks_is_7\@
304 je _initial_num_blocks_is_6\@
306 je _initial_num_blocks_is_5\@
308 je _initial_num_blocks_is_4\@
310 je _initial_num_blocks_is_3\@
312 je _initial_num_blocks_is_2\@
314 jmp _initial_num_blocks_is_1\@
316 _initial_num_blocks_is_7\@:
317 \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
319 jmp _initial_blocks_encrypted\@
321 _initial_num_blocks_is_6\@:
322 \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
324 jmp _initial_blocks_encrypted\@
326 _initial_num_blocks_is_5\@:
327 \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
329 jmp _initial_blocks_encrypted\@
331 _initial_num_blocks_is_4\@:
332 \INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
334 jmp _initial_blocks_encrypted\@
336 _initial_num_blocks_is_3\@:
337 \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
339 jmp _initial_blocks_encrypted\@
341 _initial_num_blocks_is_2\@:
342 \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
344 jmp _initial_blocks_encrypted\@
346 _initial_num_blocks_is_1\@:
347 \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
349 jmp _initial_blocks_encrypted\@
351 _initial_num_blocks_is_0\@:
352 \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 _initial_blocks_encrypted\@:
357 je _zero_cipher_left\@
360 je _eight_cipher_left\@
367 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
377 \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
380 jne _encrypt_by_8_new\@
382 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
383 jmp _eight_cipher_left\@
386 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
388 \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
389 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
392 jne _encrypt_by_8_new\@
394 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399 _eight_cipher_left\@:
400 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
405 jl _only_less_than_16\@
408 and $15, %r13 # r13 = (arg5 mod 16)
410 je _multiple_of_16_bytes\@
412 # handle the last <16 Byte block separately
415 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
416 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
417 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
421 vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block
423 lea SHIFT_MASK+16(%rip), %r12
424 sub %r13, %r12 # adjust the shuffle mask pointer to be
425 # able to shift 16-r13 bytes (r13 is the
426 # number of bytes in plaintext mod 16)
427 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
428 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
429 jmp _final_ghash_mul\@
431 _only_less_than_16\@:
434 and $15, %r13 # r13 = (arg5 mod 16)
436 je _multiple_of_16_bytes\@
438 # handle the last <16 Byte block separately
441 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
442 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
443 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
446 lea SHIFT_MASK+16(%rip), %r12
447 sub %r13, %r12 # adjust the shuffle mask pointer to be
448 # able to shift 16-r13 bytes (r13 is the
449 # number of bytes in plaintext mod 16)
451 _get_last_16_byte_loop\@:
452 movb (arg4, %r11), %al
453 movb %al, TMP1(%rsp, %r11)
456 jne _get_last_16_byte_loop\@
458 vmovdqu TMP1(%rsp), %xmm1
465 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
466 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
467 # mask out top 16-r13 bytes of xmm9
468 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
469 vpand %xmm1, %xmm2, %xmm2
470 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
471 vpxor %xmm2, %xmm14, %xmm14
472 #GHASH computation for the last <16 Byte block
473 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
482 vpxor %xmm9, %xmm14, %xmm14
483 #GHASH computation for the last <16 Byte block
484 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
487 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
491 #############################
495 jle _less_than_8_bytes_left\@
497 mov %rax, (arg3 , %r11)
499 vpsrldq $8, %xmm9, %xmm9
503 _less_than_8_bytes_left\@:
504 movb %al, (arg3 , %r11)
508 jne _less_than_8_bytes_left\@
509 #############################
511 _multiple_of_16_bytes\@:
512 mov arg8, %r12 # r12 = aadLen (number of bytes)
513 shl $3, %r12 # convert into number of bits
514 vmovd %r12d, %xmm15 # len(A) in xmm15
516 shl $3, arg5 # len(C) in bits (*8)
518 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
519 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
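# per the GCM spec, the final GHASH input block is the 64-bit bit length of
# the AAD concatenated with the 64-bit bit length of the ciphertext:
# len(A)||len(C)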
521 vpxor %xmm15, %xmm14, %xmm14
522 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
523 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
525 mov arg6, %rax # rax = *Y0
526 vmovdqu (%rax), %xmm9 # xmm9 = Y0
528 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
530 vpxor %xmm14, %xmm9, %xmm9
535 mov arg9, %r10 # r10 = authTag
536 mov arg10, %r11 # r11 = auth_tag_len
549 vpsrldq $8, %xmm9, %xmm9
557 vpsrldq $4, %xmm9, %xmm9
574 vmovdqu %xmm9, (%r10)
580 ###############################################################################
581 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
582 # Input: A and B (128-bits each, bit-reflected)
583 # Output: C = A*B*x mod poly, (i.e. >>1 )
584 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
585 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
586 ###############################################################################
587 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
589 vpshufd $0b01001110, \GH, \T2
590 vpshufd $0b01001110, \HK, \T3
591 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
592 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
594 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
595 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
596 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
598 vpxor \T1, \T2, \T2
599 vpxor \GH, \T2, \T2 # T2 = a0*b1+a1*b0
600 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
601 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
603 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
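# the middle 128-bit Karatsuba term straddles both halves of the 256-bit
# product: the vpslldq/vpsrldq pair splits it so its low qword lands in the
# top of GH and its high qword in the bottom of T1, leaving the full
# carry-less product in <T1:GH>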
605 #first phase of the reduction
606 vpslld $31, \GH, \T2 # packed left shift << 31
607 vpslld $30, \GH, \T3 # packed left shift << 30
608 vpslld $25, \GH, \T4 # packed left shift << 25
610 vpxor \T3, \T2, \T2 # xor the shifted versions
613 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
615 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
616 vpxor \T2, \GH, \GH # first phase of the reduction complete
618 #second phase of the reduction
620 vpsrld $1, \GH, \T2 # packed right shift >> 1
621 vpsrld $2, \GH, \T3 # packed right shift >> 2
622 vpsrld $7, \GH, \T4 # packed right shift >> 7
623 vpxor \T3, \T2, \T2 # xor the shifted versions
628 vpxor \T1, \GH, \GH # the result is in GH
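# a note on the shift counts above: in the bit-reflected representation used
# by GHASH, reducing modulo poly = x^128 + x^127 + x^126 + x^121 + 1 folds
# the low half of the product by the x^127/x^126/x^121 terms; those terms
# show up as the dword left shifts by 31/30/25 in the first phase and the
# right shifts by 1/2/7 in the second phase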
633 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
635 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
638 vpshufd $0b01001110, \T5, \T1
640 vmovdqu \T1, HashKey_k(arg2)
642 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
643 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
644 vpshufd $0b01001110, \T5, \T1
646 vmovdqu \T1, HashKey_2_k(arg2)
648 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
649 vmovdqu \T5, HashKey_3(arg2)
650 vpshufd $0b01001110, \T5, \T1
652 vmovdqu \T1, HashKey_3_k(arg2)
654 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
655 vmovdqu \T5, HashKey_4(arg2)
656 vpshufd $0b01001110, \T5, \T1
658 vmovdqu \T1, HashKey_4_k(arg2)
660 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
661 vmovdqu \T5, HashKey_5(arg2)
662 vpshufd $0b01001110, \T5, \T1
664 vmovdqu \T1, HashKey_5_k(arg2)
666 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
667 vmovdqu \T5, HashKey_6(arg2)
668 vpshufd $0b01001110, \T5, \T1
670 vmovdqu \T1, HashKey_6_k(arg2)
672 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
673 vmovdqu \T5, HashKey_7(arg2)
674 vpshufd $0b01001110, \T5, \T1
676 vmovdqu \T1, HashKey_7_k(arg2)
678 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
679 vmovdqu \T5, HashKey_8(arg2)
680 vpshufd $0b01001110, \T5, \T1
682 vmovdqu \T1, HashKey_8_k(arg2)
686 ## if a = number of total plaintext bytes
687 ## b = floor(a/16)
688 ## num_initial_blocks = b mod 4;
689 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
690 ## r10, r11, r12, rax are clobbered
691 ## arg1, arg3, arg4, r14 are used as pointers only, not modified
693 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
694 i = (8-\num_initial_blocks)
698 mov arg7, %r10 # r10 = AAD
699 mov arg8, %r12 # r12 = aadLen
704 vpxor reg_j, reg_j, reg_j
705 vpxor reg_i, reg_i, reg_i
709 vmovdqu (%r10), reg_i
710 vpshufb SHUF_MASK(%rip), reg_i, reg_i
711 vpxor reg_i, reg_j, reg_j
712 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
717 jge _get_AAD_blocks\@
722 vpxor reg_i, reg_i, reg_i
724 /* read the last <16B of AAD. since we have at least 4B of
725 data right after the AAD (the ICV, and maybe some CT), we can
726 read 4B/8B blocks safely, and then get rid of the extra stuff */
734 vpsrldq $8, reg_i, reg_i
735 vpxor \T1, reg_i, reg_i
744 vpslldq $12, \T1, \T1
745 vpsrldq $4, reg_i, reg_i
746 vpxor \T1, reg_i, reg_i
748 /* finalize: shift out the extra bytes we read, and align
749 left. since pslldq can only shift by an immediate, we use
750 vpshufb and an array of shuffle masks */
753 vmovdqu aad_shift_arr(%r11), \T1
754 vpshufb \T1, reg_i, reg_i
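# worked example: with 12 leftover AAD bytes the 8B and 4B reads above leave
# the tail in byte lanes 4..15 of reg_i, and the matching table entry
# 0xffffffff0F0E0D0C0B0A090807060504 moves lanes 4..15 down to 0..11 while
# zeroing the top four bytes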
755 _get_AAD_rest_final\@:
756 vpshufb SHUF_MASK(%rip), reg_i, reg_i
757 vpxor reg_j, reg_i, reg_i
758 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
761 # initialize the data pointer offset as zero
764 # start AES for num_initial_blocks blocks
765 mov arg6, %rax # rax = *Y0
766 vmovdqu (%rax), \CTR # CTR = Y0
767 vpshufb SHUF_MASK(%rip), \CTR, \CTR
770 i = (9-\num_initial_blocks)
772 .rep \num_initial_blocks
773 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
775 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
780 vmovdqa (arg1), \T_key
781 i = (9-\num_initial_blocks)
783 .rep \num_initial_blocks
784 vpxor \T_key, reg_i, reg_i
792 vmovdqa 16*j(arg1), \T_key
793 i = (9-\num_initial_blocks)
795 .rep \num_initial_blocks
796 vaesenc \T_key, reg_i, reg_i
806 vmovdqa 16*10(arg1), \T_key
807 i = (9-\num_initial_blocks)
809 .rep \num_initial_blocks
810 vaesenclast \T_key, reg_i, reg_i
815 i = (9-\num_initial_blocks)
817 .rep \num_initial_blocks
818 vmovdqu (arg4, %r11), \T1
819 vpxor \T1, reg_i, reg_i
820 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
825 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
831 i = (8-\num_initial_blocks)
832 j = (9-\num_initial_blocks)
835 .rep \num_initial_blocks
836 vpxor reg_i, reg_j, reg_j
837 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
842 # XMM8 has the combined result here
844 vmovdqa \XMM8, TMP1(%rsp)
848 jl _initial_blocks_done\@ # no need for precomputed constants
850 ###############################################################################
851 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
852 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
854 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
856 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
858 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
860 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
862 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
864 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
866 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
868 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
870 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
872 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
874 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
876 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
878 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
880 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
882 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
884 vmovdqa (arg1), \T_key
885 vpxor \T_key, \XMM1, \XMM1
886 vpxor \T_key, \XMM2, \XMM2
887 vpxor \T_key, \XMM3, \XMM3
888 vpxor \T_key, \XMM4, \XMM4
889 vpxor \T_key, \XMM5, \XMM5
890 vpxor \T_key, \XMM6, \XMM6
891 vpxor \T_key, \XMM7, \XMM7
892 vpxor \T_key, \XMM8, \XMM8
897 vmovdqa 16*i(arg1), \T_key
898 vaesenc \T_key, \XMM1, \XMM1
899 vaesenc \T_key, \XMM2, \XMM2
900 vaesenc \T_key, \XMM3, \XMM3
901 vaesenc \T_key, \XMM4, \XMM4
902 vaesenc \T_key, \XMM5, \XMM5
903 vaesenc \T_key, \XMM6, \XMM6
904 vaesenc \T_key, \XMM7, \XMM7
905 vaesenc \T_key, \XMM8, \XMM8
911 vmovdqa 16*i(arg1), \T_key
912 vaesenclast \T_key, \XMM1, \XMM1
913 vaesenclast \T_key, \XMM2, \XMM2
914 vaesenclast \T_key, \XMM3, \XMM3
915 vaesenclast \T_key, \XMM4, \XMM4
916 vaesenclast \T_key, \XMM5, \XMM5
917 vaesenclast \T_key, \XMM6, \XMM6
918 vaesenclast \T_key, \XMM7, \XMM7
919 vaesenclast \T_key, \XMM8, \XMM8
921 vmovdqu (arg4, %r11), \T1
922 vpxor \T1, \XMM1, \XMM1
923 vmovdqu \XMM1, (arg3 , %r11)
928 vmovdqu 16*1(arg4, %r11), \T1
929 vpxor \T1, \XMM2, \XMM2
930 vmovdqu \XMM2, 16*1(arg3 , %r11)
935 vmovdqu 16*2(arg4, %r11), \T1
936 vpxor \T1, \XMM3, \XMM3
937 vmovdqu \XMM3, 16*2(arg3 , %r11)
942 vmovdqu 16*3(arg4, %r11), \T1
943 vpxor \T1, \XMM4, \XMM4
944 vmovdqu \XMM4, 16*3(arg3 , %r11)
949 vmovdqu 16*4(arg4, %r11), \T1
950 vpxor \T1, \XMM5, \XMM5
951 vmovdqu \XMM5, 16*4(arg3 , %r11)
956 vmovdqu 16*5(arg4, %r11), \T1
957 vpxor \T1, \XMM6, \XMM6
958 vmovdqu \XMM6, 16*5(arg3 , %r11)
963 vmovdqu 16*6(arg4, %r11), \T1
964 vpxor \T1, \XMM7, \XMM7
965 vmovdqu \XMM7, 16*6(arg3 , %r11)
970 vmovdqu 16*7(arg4, %r11), \T1
971 vpxor \T1, \XMM8, \XMM8
972 vmovdqu \XMM8, 16*7(arg3 , %r11)
979 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
980 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
981 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
982 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
983 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
984 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
985 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
986 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
987 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
989 ###############################################################################
991 _initial_blocks_done\@:
995 # encrypt 8 blocks at a time
996 # ghash the 8 previously encrypted ciphertext blocks
997 # arg1, arg3, arg4 are used as pointers only, not modified
998 # r11 is the data offset value
999 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1002 vmovdqa \XMM2, TMP2(%rsp)
1003 vmovdqa \XMM3, TMP3(%rsp)
1004 vmovdqa \XMM4, TMP4(%rsp)
1005 vmovdqa \XMM5, TMP5(%rsp)
1006 vmovdqa \XMM6, TMP6(%rsp)
1007 vmovdqa \XMM7, TMP7(%rsp)
1008 vmovdqa \XMM8, TMP8(%rsp)
1010 .if \loop_idx == in_order
1011 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1012 vpaddd ONE(%rip), \XMM1, \XMM2
1013 vpaddd ONE(%rip), \XMM2, \XMM3
1014 vpaddd ONE(%rip), \XMM3, \XMM4
1015 vpaddd ONE(%rip), \XMM4, \XMM5
1016 vpaddd ONE(%rip), \XMM5, \XMM6
1017 vpaddd ONE(%rip), \XMM6, \XMM7
1018 vpaddd ONE(%rip), \XMM7, \XMM8
1021 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1022 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1023 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1024 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1025 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1026 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1027 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1028 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1030 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1031 vpaddd ONEf(%rip), \XMM1, \XMM2
1032 vpaddd ONEf(%rip), \XMM2, \XMM3
1033 vpaddd ONEf(%rip), \XMM3, \XMM4
1034 vpaddd ONEf(%rip), \XMM4, \XMM5
1035 vpaddd ONEf(%rip), \XMM5, \XMM6
1036 vpaddd ONEf(%rip), \XMM6, \XMM7
1037 vpaddd ONEf(%rip), \XMM7, \XMM8
1042 #######################################################################
1045 vpxor \T1, \XMM1, \XMM1
1046 vpxor \T1, \XMM2, \XMM2
1047 vpxor \T1, \XMM3, \XMM3
1048 vpxor \T1, \XMM4, \XMM4
1049 vpxor \T1, \XMM5, \XMM5
1050 vpxor \T1, \XMM6, \XMM6
1051 vpxor \T1, \XMM7, \XMM7
1052 vpxor \T1, \XMM8, \XMM8
1054 #######################################################################
1060 vmovdqu 16*1(arg1), \T1
1061 vaesenc \T1, \XMM1, \XMM1
1062 vaesenc \T1, \XMM2, \XMM2
1063 vaesenc \T1, \XMM3, \XMM3
1064 vaesenc \T1, \XMM4, \XMM4
1065 vaesenc \T1, \XMM5, \XMM5
1066 vaesenc \T1, \XMM6, \XMM6
1067 vaesenc \T1, \XMM7, \XMM7
1068 vaesenc \T1, \XMM8, \XMM8
1070 vmovdqu 16*2(arg1), \T1
1071 vaesenc \T1, \XMM1, \XMM1
1072 vaesenc \T1, \XMM2, \XMM2
1073 vaesenc \T1, \XMM3, \XMM3
1074 vaesenc \T1, \XMM4, \XMM4
1075 vaesenc \T1, \XMM5, \XMM5
1076 vaesenc \T1, \XMM6, \XMM6
1077 vaesenc \T1, \XMM7, \XMM7
1078 vaesenc \T1, \XMM8, \XMM8
1081 #######################################################################
1083 vmovdqu HashKey_8(arg2), \T5
1084 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1085 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1087 vpshufd $0b01001110, \T2, \T6
1090 vmovdqu HashKey_8_k(arg2), \T5
1091 vpclmulqdq $0x00, \T5, \T6, \T6
1093 vmovdqu 16*3(arg1), \T1
1094 vaesenc \T1, \XMM1, \XMM1
1095 vaesenc \T1, \XMM2, \XMM2
1096 vaesenc \T1, \XMM3, \XMM3
1097 vaesenc \T1, \XMM4, \XMM4
1098 vaesenc \T1, \XMM5, \XMM5
1099 vaesenc \T1, \XMM6, \XMM6
1100 vaesenc \T1, \XMM7, \XMM7
1101 vaesenc \T1, \XMM8, \XMM8
1103 vmovdqa TMP2(%rsp), \T1
1104 vmovdqu HashKey_7(arg2), \T5
1105 vpclmulqdq $0x11, \T5, \T1, \T3
1107 vpclmulqdq $0x00, \T5, \T1, \T3
1110 vpshufd $0b01001110, \T1, \T3
1112 vmovdqu HashKey_7_k(arg2), \T5
1113 vpclmulqdq $0x10, \T5, \T3, \T3
1116 vmovdqu 16*4(arg1), \T1
1117 vaesenc \T1, \XMM1, \XMM1
1118 vaesenc \T1, \XMM2, \XMM2
1119 vaesenc \T1, \XMM3, \XMM3
1120 vaesenc \T1, \XMM4, \XMM4
1121 vaesenc \T1, \XMM5, \XMM5
1122 vaesenc \T1, \XMM6, \XMM6
1123 vaesenc \T1, \XMM7, \XMM7
1124 vaesenc \T1, \XMM8, \XMM8
1126 #######################################################################
1128 vmovdqa TMP3(%rsp), \T1
1129 vmovdqu HashKey_6(arg2), \T5
1130 vpclmulqdq $0x11, \T5, \T1, \T3
1132 vpclmulqdq $0x00, \T5, \T1, \T3
1135 vpshufd $0b01001110, \T1, \T3
1137 vmovdqu HashKey_6_k(arg2), \T5
1138 vpclmulqdq $0x10, \T5, \T3, \T3
1141 vmovdqu 16*5(arg1), \T1
1142 vaesenc \T1, \XMM1, \XMM1
1143 vaesenc \T1, \XMM2, \XMM2
1144 vaesenc \T1, \XMM3, \XMM3
1145 vaesenc \T1, \XMM4, \XMM4
1146 vaesenc \T1, \XMM5, \XMM5
1147 vaesenc \T1, \XMM6, \XMM6
1148 vaesenc \T1, \XMM7, \XMM7
1149 vaesenc \T1, \XMM8, \XMM8
1151 vmovdqa TMP4(%rsp), \T1
1152 vmovdqu HashKey_5(arg2), \T5
1153 vpclmulqdq $0x11, \T5, \T1, \T3
1155 vpclmulqdq $0x00, \T5, \T1, \T3
1158 vpshufd $0b01001110, \T1, \T3
1160 vmovdqu HashKey_5_k(arg2), \T5
1161 vpclmulqdq $0x10, \T5, \T3, \T3
1164 vmovdqu 16*6(arg1), \T1
1165 vaesenc \T1, \XMM1, \XMM1
1166 vaesenc \T1, \XMM2, \XMM2
1167 vaesenc \T1, \XMM3, \XMM3
1168 vaesenc \T1, \XMM4, \XMM4
1169 vaesenc \T1, \XMM5, \XMM5
1170 vaesenc \T1, \XMM6, \XMM6
1171 vaesenc \T1, \XMM7, \XMM7
1172 vaesenc \T1, \XMM8, \XMM8
1175 vmovdqa TMP5(%rsp), \T1
1176 vmovdqu HashKey_4(arg2), \T5
1177 vpclmulqdq $0x11, \T5, \T1, \T3
1179 vpclmulqdq $0x00, \T5, \T1, \T3
1182 vpshufd $0b01001110, \T1, \T3
1184 vmovdqu HashKey_4_k(arg2), \T5
1185 vpclmulqdq $0x10, \T5, \T3, \T3
1188 vmovdqu 16*7(arg1), \T1
1189 vaesenc \T1, \XMM1, \XMM1
1190 vaesenc \T1, \XMM2, \XMM2
1191 vaesenc \T1, \XMM3, \XMM3
1192 vaesenc \T1, \XMM4, \XMM4
1193 vaesenc \T1, \XMM5, \XMM5
1194 vaesenc \T1, \XMM6, \XMM6
1195 vaesenc \T1, \XMM7, \XMM7
1196 vaesenc \T1, \XMM8, \XMM8
1198 vmovdqa TMP6(%rsp), \T1
1199 vmovdqu HashKey_3(arg2), \T5
1200 vpclmulqdq $0x11, \T5, \T1, \T3
1202 vpclmulqdq $0x00, \T5, \T1, \T3
1205 vpshufd $0b01001110, \T1, \T3
1207 vmovdqu HashKey_3_k(arg2), \T5
1208 vpclmulqdq $0x10, \T5, \T3, \T3
1212 vmovdqu 16*8(arg1), \T1
1213 vaesenc \T1, \XMM1, \XMM1
1214 vaesenc \T1, \XMM2, \XMM2
1215 vaesenc \T1, \XMM3, \XMM3
1216 vaesenc \T1, \XMM4, \XMM4
1217 vaesenc \T1, \XMM5, \XMM5
1218 vaesenc \T1, \XMM6, \XMM6
1219 vaesenc \T1, \XMM7, \XMM7
1220 vaesenc \T1, \XMM8, \XMM8
1222 vmovdqa TMP7(%rsp), \T1
1223 vmovdqu HashKey_2(arg2), \T5
1224 vpclmulqdq $0x11, \T5, \T1, \T3
1226 vpclmulqdq $0x00, \T5, \T1, \T3
1229 vpshufd $0b01001110, \T1, \T3
1231 vmovdqu HashKey_2_k(arg2), \T5
1232 vpclmulqdq $0x10, \T5, \T3, \T3
1235 #######################################################################
1237 vmovdqu 16*9(arg1), \T5
1238 vaesenc \T5, \XMM1, \XMM1
1239 vaesenc \T5, \XMM2, \XMM2
1240 vaesenc \T5, \XMM3, \XMM3
1241 vaesenc \T5, \XMM4, \XMM4
1242 vaesenc \T5, \XMM5, \XMM5
1243 vaesenc \T5, \XMM6, \XMM6
1244 vaesenc \T5, \XMM7, \XMM7
1245 vaesenc \T5, \XMM8, \XMM8
1247 vmovdqa TMP8(%rsp), \T1
1248 vmovdqu HashKey(arg2), \T5
1249 vpclmulqdq $0x11, \T5, \T1, \T3
1251 vpclmulqdq $0x00, \T5, \T1, \T3
1254 vpshufd $0b01001110, \T1, \T3
1256 vmovdqu HashKey_k(arg2), \T5
1257 vpclmulqdq $0x10, \T5, \T3, \T3
1263 vmovdqu 16*10(arg1), \T5
1269 vpxor 16*i(arg4, %r11), \T5, \T2
1271 vaesenclast \T2, reg_j, reg_j
1273 vaesenclast \T2, reg_j, \T3
1274 vmovdqu 16*i(arg4, %r11), reg_j
1275 vmovdqu \T3, 16*i(arg3, %r11)
1281 #######################################################################
1284 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1285 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1287 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1291 #######################################################################
1292 #first phase of the reduction
1293 #######################################################################
1294 vpslld $31, \T7, \T2 # packed left shift << 31
1295 vpslld $30, \T7, \T3 # packed left shift << 30
1296 vpslld $25, \T7, \T4 # packed left shift << 25
1298 vpxor \T3, \T2, \T2 # xor the shifted versions
1301 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1303 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1304 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1305 #######################################################################
1307 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1308 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1309 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1310 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1311 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1312 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1313 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1314 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1317 #######################################################################
1318 #second phase of the reduction
1319 vpsrld $1, \T7, \T2 # packed right shift >> 1
1320 vpsrld $2, \T7, \T3 # packed right shift >> 2
1321 vpsrld $7, \T7, \T4 # packed right shift >> 7
1322 vpxor \T3, \T2, \T2 # xor the shifted versions
1327 vpxor \T7, \T6, \T6 # the result is in T6
1328 #######################################################################
1330 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1331 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1332 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1333 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1334 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1335 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1336 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1337 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1340 vpxor \T6, \XMM1, \XMM1
1347 # GHASH the last 8 ciphertext blocks.
1348 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1353 vpshufd $0b01001110, \XMM1, \T2
1354 vpxor \XMM1, \T2, \T2
1355 vmovdqu HashKey_8(arg2), \T5
1356 vpclmulqdq $0x11, \T5, \XMM1, \T6
1357 vpclmulqdq $0x00, \T5, \XMM1, \T7
1359 vmovdqu HashKey_8_k(arg2), \T3
1360 vpclmulqdq $0x00, \T3, \T2, \XMM1
1362 ######################
1364 vpshufd $0b01001110, \XMM2, \T2
1365 vpxor \XMM2, \T2, \T2
1366 vmovdqu HashKey_7(arg2), \T5
1367 vpclmulqdq $0x11, \T5, \XMM2, \T4
1370 vpclmulqdq $0x00, \T5, \XMM2, \T4
1373 vmovdqu HashKey_7_k(arg2), \T3
1374 vpclmulqdq $0x00, \T3, \T2, \T2
1375 vpxor \T2, \XMM1, \XMM1
1377 ######################
1379 vpshufd $0b01001110, \XMM3, \T2
1380 vpxor \XMM3, \T2, \T2
1381 vmovdqu HashKey_6(arg2), \T5
1382 vpclmulqdq $0x11, \T5, \XMM3, \T4
1385 vpclmulqdq $0x00, \T5, \XMM3, \T4
1388 vmovdqu HashKey_6_k(arg2), \T3
1389 vpclmulqdq $0x00, \T3, \T2, \T2
1390 vpxor \T2, \XMM1, \XMM1
1392 ######################
1394 vpshufd $0b01001110, \XMM4, \T2
1395 vpxor \XMM4, \T2, \T2
1396 vmovdqu HashKey_5(arg2), \T5
1397 vpclmulqdq $0x11, \T5, \XMM4, \T4
1400 vpclmulqdq $0x00, \T5, \XMM4, \T4
1403 vmovdqu HashKey_5_k(arg2), \T3
1404 vpclmulqdq $0x00, \T3, \T2, \T2
1405 vpxor \T2, \XMM1, \XMM1
1407 ######################
1409 vpshufd $0b01001110, \XMM5, \T2
1410 vpxor \XMM5, \T2, \T2
1411 vmovdqu HashKey_4(arg2), \T5
1412 vpclmulqdq $0x11, \T5, \XMM5, \T4
1415 vpclmulqdq $0x00, \T5, \XMM5, \T4
1418 vmovdqu HashKey_4_k(arg2), \T3
1419 vpclmulqdq $0x00, \T3, \T2, \T2
1420 vpxor \T2, \XMM1, \XMM1
1422 ######################
1424 vpshufd $0b01001110, \XMM6, \T2
1425 vpxor \XMM6, \T2, \T2
1426 vmovdqu HashKey_3(arg2), \T5
1427 vpclmulqdq $0x11, \T5, \XMM6, \T4
1430 vpclmulqdq $0x00, \T5, \XMM6, \T4
1433 vmovdqu HashKey_3_k(arg2), \T3
1434 vpclmulqdq $0x00, \T3, \T2, \T2
1435 vpxor \T2, \XMM1, \XMM1
1437 ######################
1439 vpshufd $0b01001110, \XMM7, \T2
1440 vpxor \XMM7, \T2, \T2
1441 vmovdqu HashKey_2(arg2), \T5
1442 vpclmulqdq $0x11, \T5, \XMM7, \T4
1445 vpclmulqdq $0x00, \T5, \XMM7, \T4
1448 vmovdqu HashKey_2_k(arg2), \T3
1449 vpclmulqdq $0x00, \T3, \T2, \T2
1450 vpxor \T2, \XMM1, \XMM1
1452 ######################
1454 vpshufd $0b01001110, \XMM8, \T2
1455 vpxor \XMM8, \T2, \T2
1456 vmovdqu HashKey(arg2), \T5
1457 vpclmulqdq $0x11, \T5, \XMM8, \T4
1460 vpclmulqdq $0x00, \T5, \XMM8, \T4
1463 vmovdqu HashKey_k(arg2), \T3
1464 vpclmulqdq $0x00, \T3, \T2, \T2
1466 vpxor \T2, \XMM1, \XMM1
1467 vpxor \T6, \XMM1, \XMM1
1468 vpxor \T7, \XMM1, \T2
1473 vpslldq $8, \T2, \T4
1474 vpsrldq $8, \T2, \T2
1477 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1478 # the accumulated carry-less multiplications
1480 #######################################################################
1481 #first phase of the reduction
1482 vpslld $31, \T7, \T2 # packed left shift << 31
1483 vpslld $30, \T7, \T3 # packed left shift << 30
1484 vpslld $25, \T7, \T4 # packed left shift << 25
1486 vpxor \T3, \T2, \T2 # xor the shifted versions
1489 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1491 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1492 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1493 #######################################################################
1496 #second phase of the reduction
1497 vpsrld $1, \T7, \T2 # packed right shift >> 1
1498 vpsrld $2, \T7, \T3 # packed right shift >> 2
1499 vpsrld $7, \T7, \T4 # packed right shift >> 7
1500 vpxor \T3, \T2, \T2 # xor the shifted versions
1505 vpxor \T7, \T6, \T6 # the result is in T6
1509 #############################################################
1510 #void aesni_gcm_precomp_avx_gen2
1511 # (gcm_data *my_ctx_data,
1512 # gcm_context_data *data,
1513 # u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1514 #############################################################
1515 ENTRY(aesni_gcm_precomp_avx_gen2)
1518 vmovdqu (arg3), %xmm6 # xmm6 = HashKey
1520 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1521 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1522 vmovdqa %xmm6, %xmm2
1523 vpsllq $1, %xmm6, %xmm6
1524 vpsrlq $63, %xmm2, %xmm2
1525 vmovdqa %xmm2, %xmm1
1526 vpslldq $8, %xmm2, %xmm2
1527 vpsrldq $8, %xmm1, %xmm1
1528 vpor %xmm2, %xmm6, %xmm6
1530 vpshufd $0b00100100, %xmm1, %xmm2
1531 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1532 vpand POLY(%rip), %xmm2, %xmm2
1533 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
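# branch-free H<<1 mod poly: the per-lane shifts plus the cross-lane moves
# above compute the 128-bit H<<1, while the TWOONE compare effectively builds
# an all-ones mask only when the top bit of H was set, so the vpand/vpxor
# pair folds in POLY conditionally without a branch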
1534 #######################################################################
1535 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
1538 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1542 ENDPROC(aesni_gcm_precomp_avx_gen2)
1544 ###############################################################################
1545 #void aesni_gcm_enc_avx_gen2(
1546 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1547 # gcm_context_data *data,
1548 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1549 # const u8 *in, /* Plaintext input */
1550 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1551 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1552 # (from Security Association) concatenated with 8 byte
1553 # Initialisation Vector (from IPSec ESP Payload)
1554 # concatenated with 0x00000001. 16-byte aligned pointer. */
1555 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1556 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1557 # u8 *auth_tag, /* Authenticated Tag output. */
1558 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1559 # Valid values are 16 (most likely), 12 or 8. */
1560 ###############################################################################
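# a sketch of the expected call sequence from C, matching the prototype
# comments above (the concrete gcm_data/gcm_context_data definitions live in
# the aesni glue code):
#
#   aesni_gcm_precomp_avx_gen2(ctx, &gdata, hash_subkey);
#   aesni_gcm_enc_avx_gen2(ctx, &gdata, out, in, plaintext_len,
#                          iv, aad, aad_len, auth_tag, 16);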
1561 ENTRY(aesni_gcm_enc_avx_gen2)
1563 GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
1566 ENDPROC(aesni_gcm_enc_avx_gen2)
1568 ###############################################################################
1569 #void aesni_gcm_dec_avx_gen2(
1570 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1571 # gcm_context_data *data,
1572 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1573 # const u8 *in, /* Ciphertext input */
1574 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
1575 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1576 # (from Security Association) concatenated with 8 byte
1577 # Initialisation Vector (from IPSec ESP Payload)
1578 # concatenated with 0x00000001. 16-byte aligned pointer. */
1579 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1580 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1581 # u8 *auth_tag, /* Authenticated Tag output. */
1582 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1583 # Valid values are 16 (most likely), 12 or 8. */
1584 ###############################################################################
1585 ENTRY(aesni_gcm_dec_avx_gen2)
1587 GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
1590 ENDPROC(aesni_gcm_dec_avx_gen2)
1591 #endif /* CONFIG_AS_AVX */
1593 #ifdef CONFIG_AS_AVX2
1594 ###############################################################################
1595 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1596 # Input: A and B (128-bits each, bit-reflected)
1597 # Output: C = A*B*x mod poly, (i.e. >>1 )
1598 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1599 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1600 ###############################################################################
1601 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1603 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1604 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1605 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1606 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1610 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1611 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1616 #######################################################################
1617 #first phase of the reduction
1618 vmovdqa POLY2(%rip), \T3
1620 vpclmulqdq $0x01, \GH, \T3, \T2
1621 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1623 vpxor \T2, \GH, \GH # first phase of the reduction complete
1624 #######################################################################
1625 #second phase of the reduction
1626 vpclmulqdq $0x00, \GH, \T3, \T2
1627 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1629 vpclmulqdq $0x10, \GH, \T3, \GH
1630 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1632 vpxor \T2, \GH, \GH # second phase of the reduction complete
1633 #######################################################################
1634 vpxor \T1, \GH, \GH # the result is in GH
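# unlike GHASH_MUL_AVX, this variant computes the 256-bit product with four
# vpclmulqdq calls (schoolbook, no Karatsuba) and reduces with two further
# carry-less multiplies against POLY2, which packs the bit-reflected
# reduction polynomial for the two phases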
1639 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1641 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1643 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1644 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1646 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1647 vmovdqu \T5, HashKey_3(arg2)
1649 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1650 vmovdqu \T5, HashKey_4(arg2)
1652 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1653 vmovdqu \T5, HashKey_5(arg2)
1655 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1656 vmovdqu \T5, HashKey_6(arg2)
1658 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1659 vmovdqu \T5, HashKey_7(arg2)
1661 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1662 vmovdqu \T5, HashKey_8(arg2)
1667 ## if a = number of total plaintext bytes
1668 ## b = floor(a/16)
1669 ## num_initial_blocks = b mod 4;
1670 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1671 ## r10, r11, r12, rax are clobbered
1672 ## arg1, arg3, arg4, r14 are used as pointers only, not modified
1674 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1675 i = (8-\num_initial_blocks)
1679 mov arg7, %r10 # r10 = AAD
1680 mov arg8, %r12 # r12 = aadLen
1685 vpxor reg_j, reg_j, reg_j
1686 vpxor reg_i, reg_i, reg_i
1691 vmovdqu (%r10), reg_i
1692 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1693 vpxor reg_i, reg_j, reg_j
1694 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1699 jge _get_AAD_blocks\@
1700 vmovdqu reg_j, reg_i
1704 vpxor reg_i, reg_i, reg_i
1706 /* read the last <16B of AAD. since we have at least 4B of
1707 data right after the AAD (the ICV, and maybe some CT), we can
1708 read 4B/8B blocks safely, and then get rid of the extra stuff */
1711 jle _get_AAD_rest4\@
1715 vpslldq $8, \T1, \T1
1716 vpsrldq $8, reg_i, reg_i
1717 vpxor \T1, reg_i, reg_i
1718 jmp _get_AAD_rest8\@
1721 jle _get_AAD_rest0\@
1726 vpslldq $12, \T1, \T1
1727 vpsrldq $4, reg_i, reg_i
1728 vpxor \T1, reg_i, reg_i
1730 /* finalize: shift out the extra bytes we read, and align
1731 left. since pslldq can only shift by an immediate, we use
1732 vpshufb and an array of shuffle masks */
1735 vmovdqu aad_shift_arr(%r11), \T1
1736 vpshufb \T1, reg_i, reg_i
1737 _get_AAD_rest_final\@:
1738 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1739 vpxor reg_j, reg_i, reg_i
1740 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1743 # initialize the data pointer offset as zero
1746 # start AES for num_initial_blocks blocks
1747 mov arg6, %rax # rax = *Y0
1748 vmovdqu (%rax), \CTR # CTR = Y0
1749 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1752 i = (9-\num_initial_blocks)
1754 .rep \num_initial_blocks
1755 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1757 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1762 vmovdqa (arg1), \T_key
1763 i = (9-\num_initial_blocks)
1765 .rep \num_initial_blocks
1766 vpxor \T_key, reg_i, reg_i
1774 vmovdqa 16*j(arg1), \T_key
1775 i = (9-\num_initial_blocks)
1777 .rep \num_initial_blocks
1778 vaesenc \T_key, reg_i, reg_i
1788 vmovdqa 16*10(arg1), \T_key
1789 i = (9-\num_initial_blocks)
1791 .rep \num_initial_blocks
1792 vaesenclast \T_key, reg_i, reg_i
1797 i = (9-\num_initial_blocks)
1799 .rep \num_initial_blocks
1800 vmovdqu (arg4, %r11), \T1
1801 vpxor \T1, reg_i, reg_i
1802 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1803 # num_initial_blocks blocks
1808 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1814 i = (8-\num_initial_blocks)
1815 j = (9-\num_initial_blocks)
1818 .rep \num_initial_blocks
1819 vpxor reg_i, reg_j, reg_j
1820 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1825 # XMM8 has the combined result here
1827 vmovdqa \XMM8, TMP1(%rsp)
1831 jl _initial_blocks_done\@ # no need for precomputed constants
1833 ###############################################################################
1834 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
1835 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1837 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1839 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1841 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1843 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1845 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1847 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1849 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1851 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1853 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1855 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1857 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1859 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1861 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1863 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1865 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1867 vmovdqa (arg1), \T_key
1868 vpxor \T_key, \XMM1, \XMM1
1869 vpxor \T_key, \XMM2, \XMM2
1870 vpxor \T_key, \XMM3, \XMM3
1871 vpxor \T_key, \XMM4, \XMM4
1872 vpxor \T_key, \XMM5, \XMM5
1873 vpxor \T_key, \XMM6, \XMM6
1874 vpxor \T_key, \XMM7, \XMM7
1875 vpxor \T_key, \XMM8, \XMM8
1879 .rep 9 # do 9 rounds
1880 vmovdqa 16*i(arg1), \T_key
1881 vaesenc \T_key, \XMM1, \XMM1
1882 vaesenc \T_key, \XMM2, \XMM2
1883 vaesenc \T_key, \XMM3, \XMM3
1884 vaesenc \T_key, \XMM4, \XMM4
1885 vaesenc \T_key, \XMM5, \XMM5
1886 vaesenc \T_key, \XMM6, \XMM6
1887 vaesenc \T_key, \XMM7, \XMM7
1888 vaesenc \T_key, \XMM8, \XMM8
1894 vmovdqa 16*i(arg1), \T_key
1895 vaesenclast \T_key, \XMM1, \XMM1
1896 vaesenclast \T_key, \XMM2, \XMM2
1897 vaesenclast \T_key, \XMM3, \XMM3
1898 vaesenclast \T_key, \XMM4, \XMM4
1899 vaesenclast \T_key, \XMM5, \XMM5
1900 vaesenclast \T_key, \XMM6, \XMM6
1901 vaesenclast \T_key, \XMM7, \XMM7
1902 vaesenclast \T_key, \XMM8, \XMM8
1904 vmovdqu (arg4, %r11), \T1
1905 vpxor \T1, \XMM1, \XMM1
1906 vmovdqu \XMM1, (arg3 , %r11)
1911 vmovdqu 16*1(arg4, %r11), \T1
1912 vpxor \T1, \XMM2, \XMM2
1913 vmovdqu \XMM2, 16*1(arg3 , %r11)
1918 vmovdqu 16*2(arg4, %r11), \T1
1919 vpxor \T1, \XMM3, \XMM3
1920 vmovdqu \XMM3, 16*2(arg3 , %r11)
1925 vmovdqu 16*3(arg4, %r11), \T1
1926 vpxor \T1, \XMM4, \XMM4
1927 vmovdqu \XMM4, 16*3(arg3 , %r11)
1932 vmovdqu 16*4(arg4, %r11), \T1
1933 vpxor \T1, \XMM5, \XMM5
1934 vmovdqu \XMM5, 16*4(arg3 , %r11)
1939 vmovdqu 16*5(arg4, %r11), \T1
1940 vpxor \T1, \XMM6, \XMM6
1941 vmovdqu \XMM6, 16*5(arg3 , %r11)
1946 vmovdqu 16*6(arg4, %r11), \T1
1947 vpxor \T1, \XMM7, \XMM7
1948 vmovdqu \XMM7, 16*6(arg3 , %r11)
1953 vmovdqu 16*7(arg4, %r11), \T1
1954 vpxor \T1, \XMM8, \XMM8
1955 vmovdqu \XMM8, 16*7(arg3 , %r11)
1962 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1963 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1964 # the corresponding ciphertext
1965 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1966 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1967 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1968 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1969 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1970 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1971 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1973 ###############################################################################
1975 _initial_blocks_done\@:
1982 # encrypt 8 blocks at a time
1983 # ghash the 8 previously encrypted ciphertext blocks
1984 # arg1, arg3, arg4 are used as pointers only, not modified
1985 # r11 is the data offset value
1986 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1989 vmovdqa \XMM2, TMP2(%rsp)
1990 vmovdqa \XMM3, TMP3(%rsp)
1991 vmovdqa \XMM4, TMP4(%rsp)
1992 vmovdqa \XMM5, TMP5(%rsp)
1993 vmovdqa \XMM6, TMP6(%rsp)
1994 vmovdqa \XMM7, TMP7(%rsp)
1995 vmovdqa \XMM8, TMP8(%rsp)
1997 .if \loop_idx == in_order
1998 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1999 vpaddd ONE(%rip), \XMM1, \XMM2
2000 vpaddd ONE(%rip), \XMM2, \XMM3
2001 vpaddd ONE(%rip), \XMM3, \XMM4
2002 vpaddd ONE(%rip), \XMM4, \XMM5
2003 vpaddd ONE(%rip), \XMM5, \XMM6
2004 vpaddd ONE(%rip), \XMM6, \XMM7
2005 vpaddd ONE(%rip), \XMM7, \XMM8
2008 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2009 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2010 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2011 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2012 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2013 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2014 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2015 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2017 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2018 vpaddd ONEf(%rip), \XMM1, \XMM2
2019 vpaddd ONEf(%rip), \XMM2, \XMM3
2020 vpaddd ONEf(%rip), \XMM3, \XMM4
2021 vpaddd ONEf(%rip), \XMM4, \XMM5
2022 vpaddd ONEf(%rip), \XMM5, \XMM6
2023 vpaddd ONEf(%rip), \XMM6, \XMM7
2024 vpaddd ONEf(%rip), \XMM7, \XMM8
2029 #######################################################################
2032 vpxor \T1, \XMM1, \XMM1
2033 vpxor \T1, \XMM2, \XMM2
2034 vpxor \T1, \XMM3, \XMM3
2035 vpxor \T1, \XMM4, \XMM4
2036 vpxor \T1, \XMM5, \XMM5
2037 vpxor \T1, \XMM6, \XMM6
2038 vpxor \T1, \XMM7, \XMM7
2039 vpxor \T1, \XMM8, \XMM8
2041 #######################################################################
2047 vmovdqu 16*1(arg1), \T1
2048 vaesenc \T1, \XMM1, \XMM1
2049 vaesenc \T1, \XMM2, \XMM2
2050 vaesenc \T1, \XMM3, \XMM3
2051 vaesenc \T1, \XMM4, \XMM4
2052 vaesenc \T1, \XMM5, \XMM5
2053 vaesenc \T1, \XMM6, \XMM6
2054 vaesenc \T1, \XMM7, \XMM7
2055 vaesenc \T1, \XMM8, \XMM8
2057 vmovdqu 16*2(arg1), \T1
2058 vaesenc \T1, \XMM1, \XMM1
2059 vaesenc \T1, \XMM2, \XMM2
2060 vaesenc \T1, \XMM3, \XMM3
2061 vaesenc \T1, \XMM4, \XMM4
2062 vaesenc \T1, \XMM5, \XMM5
2063 vaesenc \T1, \XMM6, \XMM6
2064 vaesenc \T1, \XMM7, \XMM7
2065 vaesenc \T1, \XMM8, \XMM8
2068 #######################################################################
2070 vmovdqu HashKey_8(arg2), \T5
2071 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2072 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2073 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2074 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2077 vmovdqu 16*3(arg1), \T1
2078 vaesenc \T1, \XMM1, \XMM1
2079 vaesenc \T1, \XMM2, \XMM2
2080 vaesenc \T1, \XMM3, \XMM3
2081 vaesenc \T1, \XMM4, \XMM4
2082 vaesenc \T1, \XMM5, \XMM5
2083 vaesenc \T1, \XMM6, \XMM6
2084 vaesenc \T1, \XMM7, \XMM7
2085 vaesenc \T1, \XMM8, \XMM8
2087 vmovdqa TMP2(%rsp), \T1
2088 vmovdqu HashKey_7(arg2), \T5
2089 vpclmulqdq $0x11, \T5, \T1, \T3
2092 vpclmulqdq $0x00, \T5, \T1, \T3
2095 vpclmulqdq $0x01, \T5, \T1, \T3
2098 vpclmulqdq $0x10, \T5, \T1, \T3
2101 vmovdqu 16*4(arg1), \T1
2102 vaesenc \T1, \XMM1, \XMM1
2103 vaesenc \T1, \XMM2, \XMM2
2104 vaesenc \T1, \XMM3, \XMM3
2105 vaesenc \T1, \XMM4, \XMM4
2106 vaesenc \T1, \XMM5, \XMM5
2107 vaesenc \T1, \XMM6, \XMM6
2108 vaesenc \T1, \XMM7, \XMM7
2109 vaesenc \T1, \XMM8, \XMM8
2111 #######################################################################
2113 vmovdqa TMP3(%rsp), \T1
2114 vmovdqu HashKey_6(arg2), \T5
2115 vpclmulqdq $0x11, \T5, \T1, \T3
2118 vpclmulqdq $0x00, \T5, \T1, \T3
2121 vpclmulqdq $0x01, \T5, \T1, \T3
2124 vpclmulqdq $0x10, \T5, \T1, \T3
2127 vmovdqu 16*5(arg1), \T1
2128 vaesenc \T1, \XMM1, \XMM1
2129 vaesenc \T1, \XMM2, \XMM2
2130 vaesenc \T1, \XMM3, \XMM3
2131 vaesenc \T1, \XMM4, \XMM4
2132 vaesenc \T1, \XMM5, \XMM5
2133 vaesenc \T1, \XMM6, \XMM6
2134 vaesenc \T1, \XMM7, \XMM7
2135 vaesenc \T1, \XMM8, \XMM8
2137 vmovdqa TMP4(%rsp), \T1
2138 vmovdqu HashKey_5(arg2), \T5
2139 vpclmulqdq $0x11, \T5, \T1, \T3
2142 vpclmulqdq $0x00, \T5, \T1, \T3
2145 vpclmulqdq $0x01, \T5, \T1, \T3
2148 vpclmulqdq $0x10, \T5, \T1, \T3
2151 vmovdqu 16*6(arg1), \T1
2152 vaesenc \T1, \XMM1, \XMM1
2153 vaesenc \T1, \XMM2, \XMM2
2154 vaesenc \T1, \XMM3, \XMM3
2155 vaesenc \T1, \XMM4, \XMM4
2156 vaesenc \T1, \XMM5, \XMM5
2157 vaesenc \T1, \XMM6, \XMM6
2158 vaesenc \T1, \XMM7, \XMM7
2159 vaesenc \T1, \XMM8, \XMM8
2162 vmovdqa TMP5(%rsp), \T1
2163 vmovdqu HashKey_4(arg2), \T5
2164 vpclmulqdq $0x11, \T5, \T1, \T3
2167 vpclmulqdq $0x00, \T5, \T1, \T3
2170 vpclmulqdq $0x01, \T5, \T1, \T3
2173 vpclmulqdq $0x10, \T5, \T1, \T3
2176 vmovdqu 16*7(arg1), \T1
2177 vaesenc \T1, \XMM1, \XMM1
2178 vaesenc \T1, \XMM2, \XMM2
2179 vaesenc \T1, \XMM3, \XMM3
2180 vaesenc \T1, \XMM4, \XMM4
2181 vaesenc \T1, \XMM5, \XMM5
2182 vaesenc \T1, \XMM6, \XMM6
2183 vaesenc \T1, \XMM7, \XMM7
2184 vaesenc \T1, \XMM8, \XMM8
2186 vmovdqa TMP6(%rsp), \T1
2187 vmovdqu HashKey_3(arg2), \T5
2188 vpclmulqdq $0x11, \T5, \T1, \T3
2191 vpclmulqdq $0x00, \T5, \T1, \T3
2194 vpclmulqdq $0x01, \T5, \T1, \T3
2197 vpclmulqdq $0x10, \T5, \T1, \T3
2200 vmovdqu 16*8(arg1), \T1
2201 vaesenc \T1, \XMM1, \XMM1
2202 vaesenc \T1, \XMM2, \XMM2
2203 vaesenc \T1, \XMM3, \XMM3
2204 vaesenc \T1, \XMM4, \XMM4
2205 vaesenc \T1, \XMM5, \XMM5
2206 vaesenc \T1, \XMM6, \XMM6
2207 vaesenc \T1, \XMM7, \XMM7
2208 vaesenc \T1, \XMM8, \XMM8
        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################
        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1           # accumulate the a1*b1 products in T1
        vmovdqu 16*10(arg1), \T5

        i = 0
        j = 1
        setreg
.rep 8
                vpxor   16*i(arg4, %r11), \T5, \T2
                .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
                .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu 16*i(arg4, %r11), reg_j
                vmovdqu \T3, 16*i(arg3, %r11)
                .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
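        ## In the last AES round the per-block round key in T2 already has
        ## the input block XORed in, so vaesenclast finishes the counter
        ## encryption and applies the CTR XOR in a single step.  On
        ## decryption the original ciphertext block is reloaded into reg_j,
        ## so the GHASH in the next pass always operates on ciphertext.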
        #######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7
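        ## The 256-bit carry-less product now sits in <T1:T7> and must be
        ## reduced modulo the GHASH polynomial.  The two-phase reduction
        ## below folds it back to 128 bits using the precomputed POLY2
        ## constant; the ciphertext write-back is interleaved between the
        ## two phases to hide the vpclmulqdq latency.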
        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
        vpxor   \T1, \XMM1, \XMM1
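        ## fold the reduced GHASH accumulator into the (byte-swapped) first
        ## ciphertext block; the next iteration hashes it together with the
        ## remaining seven blocks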
.endm


# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
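        ## Karatsuba Method: each 128-bit GHASH multiplication below uses
        ## three vpclmulqdq instead of four.  T6 accumulates the high
        ## products a1*b1, T7 the low products a0*b0, and XMM1 the middle
        ## products (a1^a0)*(b1^b0); the true middle term is recovered at
        ## the end by XORing T6 and T7 into XMM1.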
        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################
        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2         # T2 holds the true middle Karatsuba term

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
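        ## <T6:T7> is the full 256-bit product of the eight accumulated
        ## multiplications; the same two-phase POLY2 reduction used in the
        ## encrypt loop now folds it to the final 128-bit GHASH value.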
        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6
.endm

#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8     *hash_subkey)# /* H, the Hash sub key input.
#                                 Data starts on a 16-byte boundary. */
#############################################################
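## Note: per the GCM specification the hash subkey is H = AES-ENCRYPT(key,
## 0^128); it is computed by the caller and passed in via hash_subkey.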
ENTRY(aesni_gcm_precomp_avx_gen4)
        FUNC_SAVE

        vmovdqu (arg3), %xmm6                   # xmm6 = HashKey
        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6                # shift each qword left by one bit
        vpsrlq  $63, %xmm2, %xmm2               # carry out of each qword
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2                # move the low-qword carry into the high qword
        vpsrldq $8, %xmm1, %xmm1                # xmm1 = carry shifted out of bit 127
        vpor    %xmm2, %xmm6, %xmm6             # xmm6 = HashKey<<1, not yet reduced
        #reduce HashKey<<1 mod poly if a bit was carried out of bit 127
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2     # all-ones mask iff the carry was set
        vpand   POLY(%rip), %xmm2, %xmm2        # select POLY only when the carry was set
        vpxor   %xmm2, %xmm6, %xmm6             # xmm6 holds the HashKey<<1 mod poly
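        ## Note: storing HashKey pre-multiplied by x (HashKey<<1 mod poly)
        ## lets the bit-reflected vpclmulqdq products used by the GHASH
        ## macros line up without an extra shift per multiplication; see
        ## the Gopal et al. paper referenced at the top of this file.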
        #######################################################################
        vmovdqu %xmm6, HashKey(arg2)            # store HashKey<<1 mod poly

        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
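        ## PRECOMPUTE_AVX2 (defined earlier in this file) derives and stores
        ## the higher powers of the hash key (HashKey_2 .. HashKey_8, i.e.
        ## HashKey^2 .. HashKey^8, each <<1 mod poly) that the 8-way
        ## parallel GHASH macros above consume.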
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        FUNC_SAVE
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        FUNC_SAVE
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)
#endif /* CONFIG_AS_AVX2 */