1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ## |                              0x1                              |
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
76 ## if AAD is 8 bytes:
77 ## AAD[2] = {A0, A1};
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ## |                           SPI (A1)                            |
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## |                 32-bit Sequence Number (A0)                   |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ## |                     32-bit Zero-padding                       |
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ## |                           SPI (A2)                            |
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## |                 64-bit Extended Sequence Number {A1,A0}       |
102 ## |                                                                |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ## |                     32-bit Zero-padding                       |
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports an aadLen of 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentations are used. one tab is
119 ## for the GHASH part, two tabs are for the AES part.
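##
## Illustrative sketch (Python-style, kept here as a comment and not part of
## the assembly; the function name pad_aad is only for illustration): how a
## short AAD is zero-padded to the 16-byte block that GHASH consumes,
## matching the {A1 A0 0 0} / {A2 A1 A0 0} examples above.
##
##     def pad_aad(aad: bytes) -> bytes:
##         # pad with zero bytes up to the next 16-byte boundary
##         return aad + b"\x00" * (-len(aad) % 16)
##
##     assert pad_aad(bytes(12)) == bytes(16)   # 12-byte AAD -> one full block
##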
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
162 aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
190 #define CurCount 16*4
191 #define PBlockLen 16*5
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
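# Why the *_k slots exist (an illustrative Python-style sketch in comments,
# not part of the assembly): Karatsuba needs the XOR of the high and low
# 64-bit halves of each operand. The hash key powers are fixed for a request,
# so their (hi ^ lo) values are computed once and cached in HashKey_i_k
# instead of being recomputed inside the hot loop.
#
#     def karatsuba_half_xor(h: int) -> int:         # h is a 128-bit value
#         hi, lo = h >> 64, h & ((1 << 64) - 1)
#         return hi ^ lo                              # what HashKey_i_k caches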
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
220 #define keysize 2*15*16(arg1)
230 .macro define_reg r n
241 # need to push 4 registers into stack to maintain
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
253 VARIABLE_OFFSET = 16*8
255 ################################
257 ################################
260 # the number of pushes must equal STACK_OFFSET
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
283 # Encryption of a single block
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
289 vaesenc 16*i(arg1), \XMM0, \XMM0
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
296 # combined for GCM encrypt and decrypt functions
297 # clobbering all xmm registers
298 # clobbering r10, r11, r12, r13, r14, r15
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
303 and $-16, %r13 # r13 = r13 - (r13 mod 16)
308 jz _initial_num_blocks_is_0\@
311 je _initial_num_blocks_is_7\@
313 je _initial_num_blocks_is_6\@
315 je _initial_num_blocks_is_5\@
317 je _initial_num_blocks_is_4\@
319 je _initial_num_blocks_is_3\@
321 je _initial_num_blocks_is_2\@
323 jmp _initial_num_blocks_is_1\@
325 _initial_num_blocks_is_7\@:
326 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
328 jmp _initial_blocks_encrypted\@
330 _initial_num_blocks_is_6\@:
331 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
333 jmp _initial_blocks_encrypted\@
335 _initial_num_blocks_is_5\@:
336 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
338 jmp _initial_blocks_encrypted\@
340 _initial_num_blocks_is_4\@:
341 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
343 jmp _initial_blocks_encrypted\@
345 _initial_num_blocks_is_3\@:
346 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
348 jmp _initial_blocks_encrypted\@
350 _initial_num_blocks_is_2\@:
351 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
353 jmp _initial_blocks_encrypted\@
355 _initial_num_blocks_is_1\@:
356 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
358 jmp _initial_blocks_encrypted\@
360 _initial_num_blocks_is_0\@:
361 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
364 _initial_blocks_encrypted\@:
366 je _zero_cipher_left\@
369 je _eight_cipher_left\@
376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
386 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
389 jne _encrypt_by_8_new\@
391 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
392 jmp _eight_cipher_left\@
395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
397 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
398 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
401 jne _encrypt_by_8_new\@
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
408 _eight_cipher_left\@:
409 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
414 jl _only_less_than_16\@
417 and $15, %r13 # r13 = (arg5 mod 16)
419 je _multiple_of_16_bytes\@
421 # handle the last <16 Byte block separately
424 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
426 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
430 vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block
432 lea SHIFT_MASK+16(%rip), %r12
433 sub %r13, %r12 # adjust the shuffle mask pointer to be
434 # able to shift 16-r13 bytes (r13 is the
435 # number of bytes in plaintext mod 16)
436 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
437 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
438 jmp _final_ghash_mul\@
440 _only_less_than_16\@:
443 and $15, %r13 # r13 = (arg5 mod 16)
445 je _multiple_of_16_bytes\@
447 # handle the last <16 Byte block separately
450 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
451 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
452 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
455 lea SHIFT_MASK+16(%rip), %r12
456 sub %r13, %r12 # adjust the shuffle mask pointer to be
457 # able to shift 16-r13 bytes (r13 is the
458 # number of bytes in plaintext mod 16)
460 _get_last_16_byte_loop\@:
461 movb (arg4, %r11), %al
462 movb %al, TMP1 (%rsp , %r11)
465 jne _get_last_16_byte_loop\@
467 vmovdqu TMP1(%rsp), %xmm1
474 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
475 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
476 # mask out top 16-r13 bytes of xmm9
477 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
478 vpand %xmm1, %xmm2, %xmm2
479 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
480 vpxor %xmm2, %xmm14, %xmm14
481 #GHASH computation for the last <16 Byte block
482 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
486 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
487 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
488 # mask out top 16-r13 bytes of xmm9
489 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
490 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
491 vpxor %xmm9, %xmm14, %xmm14
492 #GHASH computation for the last <16 Byte block
493 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
500 #############################
504 jle _less_than_8_bytes_left\@
506 mov %rax, (arg3 , %r11)
508 vpsrldq $8, %xmm9, %xmm9
512 _less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
517 jne _less_than_8_bytes_left\@
518 #############################
520 _multiple_of_16_bytes\@:
521 GCM_COMPLETE \GHASH_MUL \REP
525 # GCM_COMPLETE finishes the tag update for the last partial block
526 # Output: Authentication Tag (AUTH_TAG)
527 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
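# Illustrative sketch of the tag math performed here (Python-style, in
# comments, not part of the assembly; gf128_mul and the argument names are
# assumed helpers for illustration): the lengths block len(A)||len(C), in
# bits, is folded into the GHASH accumulator, then the result is masked with
# E(K, Y0) and truncated to auth_tag_len.
#
#     def finish_tag(ghash_acc, hash_key, aad_len, c_len, ek_y0, tag_len):
#         lengths = ((aad_len * 8) << 64) | (c_len * 8)     # bit lengths
#         ghash_acc = gf128_mul(ghash_acc ^ lengths, hash_key)
#         return (ghash_acc ^ ek_y0).to_bytes(16, "big")[:tag_len]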
528 .macro GCM_COMPLETE GHASH_MUL REP
529 mov arg8, %r12 # r12 = aadLen (number of bytes)
530 shl $3, %r12 # convert into number of bits
531 vmovd %r12d, %xmm15 # len(A) in xmm15
533 shl $3, arg5 # len(C) in bits (*8)
535 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
536 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
538 vpxor %xmm15, %xmm14, %xmm14
539 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
540 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
542 mov arg6, %rax # rax = *Y0
543 vmovdqu (%rax), %xmm9 # xmm9 = Y0
545 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
547 vpxor %xmm14, %xmm9, %xmm9
552 mov arg9, %r10 # r10 = authTag
553 mov arg10, %r11 # r11 = auth_tag_len
566 vpsrldq $8, %xmm9, %xmm9
574 vpsrldq $4, %xmm9, %xmm9
591 vmovdqu %xmm9, (%r10)
596 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
598 mov \AAD, %r10 # r10 = AAD
599 mov \AADLEN, %r12 # r12 = aadLen
610 vpshufb SHUF_MASK(%rip), \T7, \T7
612 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
617 jge _get_AAD_blocks\@
624 /* read the last <16B of AAD. since we have at least 4B of
625 data right after the AAD (the ICV, and maybe some CT), we can
626 read 4B/8B blocks safely, and then get rid of the extra stuff */
644 vpslldq $12, \T1, \T1
648 /* finalize: shift out the extra bytes we read, and align
649 left. since pslldq can only shift by an immediate, we use
650 vpshufb and an array of shuffle masks */
653 vmovdqu aad_shift_arr(%r11), \T1
654 vpshufb \T1, \T7, \T7
655 _get_AAD_rest_final\@:
656 vpshufb SHUF_MASK(%rip), \T7, \T7
658 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
661 vmovdqu \T7, AadHash(arg2)
665 ###############################################################################
666 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
667 # Input: A and B (128-bits each, bit-reflected)
668 # Output: C = A*B*x mod poly, (i.e. >>1 )
669 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
670 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
671 ###############################################################################
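#
# Illustrative sketch of the Karatsuba step below (Python-style, in comments,
# not part of the assembly; clmul() models a single PCLMULQDQ):
#
#     def clmul(a, b):                      # carry-less 64x64 -> 128 multiply
#         r = 0
#         for i in range(64):
#             if (b >> i) & 1:
#                 r ^= a << i
#         return r
#
#     def karatsuba_256(a, b):              # 256-bit product, pre-reduction
#         a1, a0 = a >> 64, a & ((1 << 64) - 1)
#         b1, b0 = b >> 64, b & ((1 << 64) - 1)
#         hi  = clmul(a1, b1)                          # T1 = a1*b1
#         lo  = clmul(a0, b0)                          # GH = a0*b0
#         mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo      # a0*b1 + a1*b0
#         return (hi << 128) ^ (mid << 64) ^ lo
#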
672 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
674 vpshufd $0b01001110, \GH, \T2
675 vpshufd $0b01001110, \HK, \T3
676 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
677 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
679 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
680 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
681 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
683 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
685 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
686 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
688 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
690 #first phase of the reduction
691 vpslld $31, \GH, \T2 # packed left shifting << 31
692 vpslld $30, \GH, \T3 # packed left shifting << 30
693 vpslld $25, \GH, \T4 # packed left shifting << 25
695 vpxor \T3, \T2, \T2 # xor the shifted versions
698 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
700 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
701 vpxor \T2, \GH, \GH # first phase of the reduction complete
703 #second phase of the reduction
705 vpsrld $1,\GH, \T2 # packed right shifting >> 1
706 vpsrld $2,\GH, \T3 # packed right shifting >> 2
707 vpsrld $7,\GH, \T4 # packed right shifting >> 7
708 vpxor \T3, \T2, \T2 # xor the shifted versions
713 vpxor \T1, \GH, \GH # the result is in GH
718 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
720 # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
723 vpshufd $0b01001110, \T5, \T1
725 vmovdqu \T1, HashKey_k(arg2)
727 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
728 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
729 vpshufd $0b01001110, \T5, \T1
731 vmovdqu \T1, HashKey_2_k(arg2)
733 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
734 vmovdqu \T5, HashKey_3(arg2)
735 vpshufd $0b01001110, \T5, \T1
737 vmovdqu \T1, HashKey_3_k(arg2)
739 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
740 vmovdqu \T5, HashKey_4(arg2)
741 vpshufd $0b01001110, \T5, \T1
743 vmovdqu \T1, HashKey_4_k(arg2)
745 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
746 vmovdqu \T5, HashKey_5(arg2)
747 vpshufd $0b01001110, \T5, \T1
749 vmovdqu \T1, HashKey_5_k(arg2)
751 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
752 vmovdqu \T5, HashKey_6(arg2)
753 vpshufd $0b01001110, \T5, \T1
755 vmovdqu \T1, HashKey_6_k(arg2)
757 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
758 vmovdqu \T5, HashKey_7(arg2)
759 vpshufd $0b01001110, \T5, \T1
761 vmovdqu \T1, HashKey_7_k(arg2)
763 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
764 vmovdqu \T5, HashKey_8(arg2)
765 vpshufd $0b01001110, \T5, \T1
767 vmovdqu \T1, HashKey_8_k(arg2)
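
# Conceptual sketch of what PRECOMPUTE_AVX stores (Python-style, in comments,
# not part of the assembly; gf128_mul is an assumed helper performing the same
# shifted multiply as GHASH_MUL_AVX):
#
#     def precompute(h1):
#         powers = [h1]                  # powers[i-1] = HashKey^i <<1 mod poly
#         for _ in range(7):
#             powers.append(gf128_mul(powers[-1], h1))
#         return powers                  # eight keys for the 8-block main loop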
771 ## if a = number of total plaintext bytes
773 ## num_initial_blocks = b mod 8, where b = floor(a/16);
774 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
775 ## r10, r11, r12, rax are clobbered
776 ## arg1, arg3, arg4, r14 are used as pointers only, not modified
778 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
779 i = (8-\num_initial_blocks)
781 vmovdqu AadHash(arg2), reg_i
783 # initialize the data pointer offset as zero
786 # start AES for num_initial_blocks blocks
787 mov arg6, %rax # rax = *Y0
788 vmovdqu (%rax), \CTR # CTR = Y0
789 vpshufb SHUF_MASK(%rip), \CTR, \CTR
792 i = (9-\num_initial_blocks)
794 .rep \num_initial_blocks
795 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
797 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
802 vmovdqa (arg1), \T_key
803 i = (9-\num_initial_blocks)
805 .rep \num_initial_blocks
806 vpxor \T_key, reg_i, reg_i
814 vmovdqa 16*j(arg1), \T_key
815 i = (9-\num_initial_blocks)
817 .rep \num_initial_blocks
818 vaesenc \T_key, reg_i, reg_i
827 vmovdqa 16*j(arg1), \T_key
828 i = (9-\num_initial_blocks)
830 .rep \num_initial_blocks
831 vaesenclast \T_key, reg_i, reg_i
836 i = (9-\num_initial_blocks)
838 .rep \num_initial_blocks
839 vmovdqu (arg4, %r11), \T1
840 vpxor \T1, reg_i, reg_i
841 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
846 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
852 i = (8-\num_initial_blocks)
853 j = (9-\num_initial_blocks)
856 .rep \num_initial_blocks
857 vpxor reg_i, reg_j, reg_j
858 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
863 # XMM8 has the combined result here
865 vmovdqa \XMM8, TMP1(%rsp)
869 jl _initial_blocks_done\@ # no need for precomputed constants
871 ###############################################################################
872 # prepare and encrypt 8 counter blocks for the main loop
873 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
875 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
877 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
879 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
881 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
883 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
885 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
887 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
889 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
891 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
893 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
895 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
897 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
899 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
901 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
903 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
905 vmovdqa (arg1), \T_key
906 vpxor \T_key, \XMM1, \XMM1
907 vpxor \T_key, \XMM2, \XMM2
908 vpxor \T_key, \XMM3, \XMM3
909 vpxor \T_key, \XMM4, \XMM4
910 vpxor \T_key, \XMM5, \XMM5
911 vpxor \T_key, \XMM6, \XMM6
912 vpxor \T_key, \XMM7, \XMM7
913 vpxor \T_key, \XMM8, \XMM8
917 .rep \REP # do REP rounds
918 vmovdqa 16*i(arg1), \T_key
919 vaesenc \T_key, \XMM1, \XMM1
920 vaesenc \T_key, \XMM2, \XMM2
921 vaesenc \T_key, \XMM3, \XMM3
922 vaesenc \T_key, \XMM4, \XMM4
923 vaesenc \T_key, \XMM5, \XMM5
924 vaesenc \T_key, \XMM6, \XMM6
925 vaesenc \T_key, \XMM7, \XMM7
926 vaesenc \T_key, \XMM8, \XMM8
931 vmovdqa 16*i(arg1), \T_key
932 vaesenclast \T_key, \XMM1, \XMM1
933 vaesenclast \T_key, \XMM2, \XMM2
934 vaesenclast \T_key, \XMM3, \XMM3
935 vaesenclast \T_key, \XMM4, \XMM4
936 vaesenclast \T_key, \XMM5, \XMM5
937 vaesenclast \T_key, \XMM6, \XMM6
938 vaesenclast \T_key, \XMM7, \XMM7
939 vaesenclast \T_key, \XMM8, \XMM8
941 vmovdqu (arg4, %r11), \T1
942 vpxor \T1, \XMM1, \XMM1
943 vmovdqu \XMM1, (arg3 , %r11)
948 vmovdqu 16*1(arg4, %r11), \T1
949 vpxor \T1, \XMM2, \XMM2
950 vmovdqu \XMM2, 16*1(arg3 , %r11)
955 vmovdqu 16*2(arg4, %r11), \T1
956 vpxor \T1, \XMM3, \XMM3
957 vmovdqu \XMM3, 16*2(arg3 , %r11)
962 vmovdqu 16*3(arg4, %r11), \T1
963 vpxor \T1, \XMM4, \XMM4
964 vmovdqu \XMM4, 16*3(arg3 , %r11)
969 vmovdqu 16*4(arg4, %r11), \T1
970 vpxor \T1, \XMM5, \XMM5
971 vmovdqu \XMM5, 16*4(arg3 , %r11)
976 vmovdqu 16*5(arg4, %r11), \T1
977 vpxor \T1, \XMM6, \XMM6
978 vmovdqu \XMM6, 16*5(arg3 , %r11)
983 vmovdqu 16*6(arg4, %r11), \T1
984 vpxor \T1, \XMM7, \XMM7
985 vmovdqu \XMM7, 16*6(arg3 , %r11)
990 vmovdqu 16*7(arg4, %r11), \T1
991 vpxor \T1, \XMM8, \XMM8
992 vmovdqu \XMM8, 16*7(arg3 , %r11)
999 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1000 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1001 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1002 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1003 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1004 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1005 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1006 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1007 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1009 ###############################################################################
1011 _initial_blocks_done\@:
1015 # encrypt 8 blocks at a time
1016 # ghash the 8 previously encrypted ciphertext blocks
1017 # arg1, arg3, arg4 are used as pointers only, not modified
1018 # r11 is the data offset value
1019 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1022 vmovdqa \XMM2, TMP2(%rsp)
1023 vmovdqa \XMM3, TMP3(%rsp)
1024 vmovdqa \XMM4, TMP4(%rsp)
1025 vmovdqa \XMM5, TMP5(%rsp)
1026 vmovdqa \XMM6, TMP6(%rsp)
1027 vmovdqa \XMM7, TMP7(%rsp)
1028 vmovdqa \XMM8, TMP8(%rsp)
1030 .if \loop_idx == in_order
1031 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1032 vpaddd ONE(%rip), \XMM1, \XMM2
1033 vpaddd ONE(%rip), \XMM2, \XMM3
1034 vpaddd ONE(%rip), \XMM3, \XMM4
1035 vpaddd ONE(%rip), \XMM4, \XMM5
1036 vpaddd ONE(%rip), \XMM5, \XMM6
1037 vpaddd ONE(%rip), \XMM6, \XMM7
1038 vpaddd ONE(%rip), \XMM7, \XMM8
1041 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1042 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1043 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1044 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1045 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1046 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1047 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1048 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1050 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1051 vpaddd ONEf(%rip), \XMM1, \XMM2
1052 vpaddd ONEf(%rip), \XMM2, \XMM3
1053 vpaddd ONEf(%rip), \XMM3, \XMM4
1054 vpaddd ONEf(%rip), \XMM4, \XMM5
1055 vpaddd ONEf(%rip), \XMM5, \XMM6
1056 vpaddd ONEf(%rip), \XMM6, \XMM7
1057 vpaddd ONEf(%rip), \XMM7, \XMM8
1062 #######################################################################
1065 vpxor \T1, \XMM1, \XMM1
1066 vpxor \T1, \XMM2, \XMM2
1067 vpxor \T1, \XMM3, \XMM3
1068 vpxor \T1, \XMM4, \XMM4
1069 vpxor \T1, \XMM5, \XMM5
1070 vpxor \T1, \XMM6, \XMM6
1071 vpxor \T1, \XMM7, \XMM7
1072 vpxor \T1, \XMM8, \XMM8
1074 #######################################################################
1080 vmovdqu 16*1(arg1), \T1
1081 vaesenc \T1, \XMM1, \XMM1
1082 vaesenc \T1, \XMM2, \XMM2
1083 vaesenc \T1, \XMM3, \XMM3
1084 vaesenc \T1, \XMM4, \XMM4
1085 vaesenc \T1, \XMM5, \XMM5
1086 vaesenc \T1, \XMM6, \XMM6
1087 vaesenc \T1, \XMM7, \XMM7
1088 vaesenc \T1, \XMM8, \XMM8
1090 vmovdqu 16*2(arg1), \T1
1091 vaesenc \T1, \XMM1, \XMM1
1092 vaesenc \T1, \XMM2, \XMM2
1093 vaesenc \T1, \XMM3, \XMM3
1094 vaesenc \T1, \XMM4, \XMM4
1095 vaesenc \T1, \XMM5, \XMM5
1096 vaesenc \T1, \XMM6, \XMM6
1097 vaesenc \T1, \XMM7, \XMM7
1098 vaesenc \T1, \XMM8, \XMM8
1101 #######################################################################
1103 vmovdqu HashKey_8(arg2), \T5
1104 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1105 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1107 vpshufd $0b01001110, \T2, \T6
1110 vmovdqu HashKey_8_k(arg2), \T5
1111 vpclmulqdq $0x00, \T5, \T6, \T6
1113 vmovdqu 16*3(arg1), \T1
1114 vaesenc \T1, \XMM1, \XMM1
1115 vaesenc \T1, \XMM2, \XMM2
1116 vaesenc \T1, \XMM3, \XMM3
1117 vaesenc \T1, \XMM4, \XMM4
1118 vaesenc \T1, \XMM5, \XMM5
1119 vaesenc \T1, \XMM6, \XMM6
1120 vaesenc \T1, \XMM7, \XMM7
1121 vaesenc \T1, \XMM8, \XMM8
1123 vmovdqa TMP2(%rsp), \T1
1124 vmovdqu HashKey_7(arg2), \T5
1125 vpclmulqdq $0x11, \T5, \T1, \T3
1127 vpclmulqdq $0x00, \T5, \T1, \T3
1130 vpshufd $0b01001110, \T1, \T3
1132 vmovdqu HashKey_7_k(arg2), \T5
1133 vpclmulqdq $0x10, \T5, \T3, \T3
1136 vmovdqu 16*4(arg1), \T1
1137 vaesenc \T1, \XMM1, \XMM1
1138 vaesenc \T1, \XMM2, \XMM2
1139 vaesenc \T1, \XMM3, \XMM3
1140 vaesenc \T1, \XMM4, \XMM4
1141 vaesenc \T1, \XMM5, \XMM5
1142 vaesenc \T1, \XMM6, \XMM6
1143 vaesenc \T1, \XMM7, \XMM7
1144 vaesenc \T1, \XMM8, \XMM8
1146 #######################################################################
1148 vmovdqa TMP3(%rsp), \T1
1149 vmovdqu HashKey_6(arg2), \T5
1150 vpclmulqdq $0x11, \T5, \T1, \T3
1152 vpclmulqdq $0x00, \T5, \T1, \T3
1155 vpshufd $0b01001110, \T1, \T3
1157 vmovdqu HashKey_6_k(arg2), \T5
1158 vpclmulqdq $0x10, \T5, \T3, \T3
1161 vmovdqu 16*5(arg1), \T1
1162 vaesenc \T1, \XMM1, \XMM1
1163 vaesenc \T1, \XMM2, \XMM2
1164 vaesenc \T1, \XMM3, \XMM3
1165 vaesenc \T1, \XMM4, \XMM4
1166 vaesenc \T1, \XMM5, \XMM5
1167 vaesenc \T1, \XMM6, \XMM6
1168 vaesenc \T1, \XMM7, \XMM7
1169 vaesenc \T1, \XMM8, \XMM8
1171 vmovdqa TMP4(%rsp), \T1
1172 vmovdqu HashKey_5(arg2), \T5
1173 vpclmulqdq $0x11, \T5, \T1, \T3
1175 vpclmulqdq $0x00, \T5, \T1, \T3
1178 vpshufd $0b01001110, \T1, \T3
1180 vmovdqu HashKey_5_k(arg2), \T5
1181 vpclmulqdq $0x10, \T5, \T3, \T3
1184 vmovdqu 16*6(arg1), \T1
1185 vaesenc \T1, \XMM1, \XMM1
1186 vaesenc \T1, \XMM2, \XMM2
1187 vaesenc \T1, \XMM3, \XMM3
1188 vaesenc \T1, \XMM4, \XMM4
1189 vaesenc \T1, \XMM5, \XMM5
1190 vaesenc \T1, \XMM6, \XMM6
1191 vaesenc \T1, \XMM7, \XMM7
1192 vaesenc \T1, \XMM8, \XMM8
1195 vmovdqa TMP5(%rsp), \T1
1196 vmovdqu HashKey_4(arg2), \T5
1197 vpclmulqdq $0x11, \T5, \T1, \T3
1199 vpclmulqdq $0x00, \T5, \T1, \T3
1202 vpshufd $0b01001110, \T1, \T3
1204 vmovdqu HashKey_4_k(arg2), \T5
1205 vpclmulqdq $0x10, \T5, \T3, \T3
1208 vmovdqu 16*7(arg1), \T1
1209 vaesenc \T1, \XMM1, \XMM1
1210 vaesenc \T1, \XMM2, \XMM2
1211 vaesenc \T1, \XMM3, \XMM3
1212 vaesenc \T1, \XMM4, \XMM4
1213 vaesenc \T1, \XMM5, \XMM5
1214 vaesenc \T1, \XMM6, \XMM6
1215 vaesenc \T1, \XMM7, \XMM7
1216 vaesenc \T1, \XMM8, \XMM8
1218 vmovdqa TMP6(%rsp), \T1
1219 vmovdqu HashKey_3(arg2), \T5
1220 vpclmulqdq $0x11, \T5, \T1, \T3
1222 vpclmulqdq $0x00, \T5, \T1, \T3
1225 vpshufd $0b01001110, \T1, \T3
1227 vmovdqu HashKey_3_k(arg2), \T5
1228 vpclmulqdq $0x10, \T5, \T3, \T3
1232 vmovdqu 16*8(arg1), \T1
1233 vaesenc \T1, \XMM1, \XMM1
1234 vaesenc \T1, \XMM2, \XMM2
1235 vaesenc \T1, \XMM3, \XMM3
1236 vaesenc \T1, \XMM4, \XMM4
1237 vaesenc \T1, \XMM5, \XMM5
1238 vaesenc \T1, \XMM6, \XMM6
1239 vaesenc \T1, \XMM7, \XMM7
1240 vaesenc \T1, \XMM8, \XMM8
1242 vmovdqa TMP7(%rsp), \T1
1243 vmovdqu HashKey_2(arg2), \T5
1244 vpclmulqdq $0x11, \T5, \T1, \T3
1246 vpclmulqdq $0x00, \T5, \T1, \T3
1249 vpshufd $0b01001110, \T1, \T3
1251 vmovdqu HashKey_2_k(arg2), \T5
1252 vpclmulqdq $0x10, \T5, \T3, \T3
1255 #######################################################################
1257 vmovdqu 16*9(arg1), \T5
1258 vaesenc \T5, \XMM1, \XMM1
1259 vaesenc \T5, \XMM2, \XMM2
1260 vaesenc \T5, \XMM3, \XMM3
1261 vaesenc \T5, \XMM4, \XMM4
1262 vaesenc \T5, \XMM5, \XMM5
1263 vaesenc \T5, \XMM6, \XMM6
1264 vaesenc \T5, \XMM7, \XMM7
1265 vaesenc \T5, \XMM8, \XMM8
1267 vmovdqa TMP8(%rsp), \T1
1268 vmovdqu HashKey(arg2), \T5
1269 vpclmulqdq $0x11, \T5, \T1, \T3
1271 vpclmulqdq $0x00, \T5, \T1, \T3
1274 vpshufd $0b01001110, \T1, \T3
1276 vmovdqu HashKey_k(arg2), \T5
1277 vpclmulqdq $0x10, \T5, \T3, \T3
1283 vmovdqu 16*10(arg1), \T5
1289 vaesenc \T5, \XMM1, \XMM1
1290 vaesenc \T5, \XMM2, \XMM2
1291 vaesenc \T5, \XMM3, \XMM3
1292 vaesenc \T5, \XMM4, \XMM4
1293 vaesenc \T5, \XMM5, \XMM5
1294 vaesenc \T5, \XMM6, \XMM6
1295 vaesenc \T5, \XMM7, \XMM7
1296 vaesenc \T5, \XMM8, \XMM8
1298 vmovdqu 16*i(arg1), \T5
1307 vpxor 16*i(arg4, %r11), \T5, \T2
1309 vaesenclast \T2, reg_j, reg_j
1311 vaesenclast \T2, reg_j, \T3
1312 vmovdqu 16*i(arg4, %r11), reg_j
1313 vmovdqu \T3, 16*i(arg3, %r11)
1319 #######################################################################
1322 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1323 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1325 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1329 #######################################################################
1330 #first phase of the reduction
1331 #######################################################################
1332 vpslld $31, \T7, \T2 # packed left shifting << 31
1333 vpslld $30, \T7, \T3 # packed left shifting << 30
1334 vpslld $25, \T7, \T4 # packed left shifting << 25
1336 vpxor \T3, \T2, \T2 # xor the shifted versions
1339 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1341 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1342 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1343 #######################################################################
1345 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1346 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1347 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1348 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1349 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1350 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1351 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1352 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1355 #######################################################################
1356 #second phase of the reduction
1357 vpsrld $1, \T7, \T2 # packed right shifting >> 1
1358 vpsrld $2, \T7, \T3 # packed right shifting >> 2
1359 vpsrld $7, \T7, \T4 # packed right shifting >> 7
1360 vpxor \T3, \T2, \T2 # xor the shifted versions
1365 vpxor \T7, \T6, \T6 # the result is in T6
1366 #######################################################################
1368 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1369 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1370 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1371 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1372 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1373 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1374 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1375 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1378 vpxor \T6, \XMM1, \XMM1
1385 # GHASH the last 8 ciphertext blocks.
1386 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1391 vpshufd $0b01001110, \XMM1, \T2
1392 vpxor \XMM1, \T2, \T2
1393 vmovdqu HashKey_8(arg2), \T5
1394 vpclmulqdq $0x11, \T5, \XMM1, \T6
1395 vpclmulqdq $0x00, \T5, \XMM1, \T7
1397 vmovdqu HashKey_8_k(arg2), \T3
1398 vpclmulqdq $0x00, \T3, \T2, \XMM1
1400 ######################
1402 vpshufd $0b01001110, \XMM2, \T2
1403 vpxor \XMM2, \T2, \T2
1404 vmovdqu HashKey_7(arg2), \T5
1405 vpclmulqdq $0x11, \T5, \XMM2, \T4
1408 vpclmulqdq $0x00, \T5, \XMM2, \T4
1411 vmovdqu HashKey_7_k(arg2), \T3
1412 vpclmulqdq $0x00, \T3, \T2, \T2
1413 vpxor \T2, \XMM1, \XMM1
1415 ######################
1417 vpshufd $0b01001110, \XMM3, \T2
1418 vpxor \XMM3, \T2, \T2
1419 vmovdqu HashKey_6(arg2), \T5
1420 vpclmulqdq $0x11, \T5, \XMM3, \T4
1423 vpclmulqdq $0x00, \T5, \XMM3, \T4
1426 vmovdqu HashKey_6_k(arg2), \T3
1427 vpclmulqdq $0x00, \T3, \T2, \T2
1428 vpxor \T2, \XMM1, \XMM1
1430 ######################
1432 vpshufd $0b01001110, \XMM4, \T2
1433 vpxor \XMM4, \T2, \T2
1434 vmovdqu HashKey_5(arg2), \T5
1435 vpclmulqdq $0x11, \T5, \XMM4, \T4
1438 vpclmulqdq $0x00, \T5, \XMM4, \T4
1441 vmovdqu HashKey_5_k(arg2), \T3
1442 vpclmulqdq $0x00, \T3, \T2, \T2
1443 vpxor \T2, \XMM1, \XMM1
1445 ######################
1447 vpshufd $0b01001110, \XMM5, \T2
1448 vpxor \XMM5, \T2, \T2
1449 vmovdqu HashKey_4(arg2), \T5
1450 vpclmulqdq $0x11, \T5, \XMM5, \T4
1453 vpclmulqdq $0x00, \T5, \XMM5, \T4
1456 vmovdqu HashKey_4_k(arg2), \T3
1457 vpclmulqdq $0x00, \T3, \T2, \T2
1458 vpxor \T2, \XMM1, \XMM1
1460 ######################
1462 vpshufd $0b01001110, \XMM6, \T2
1463 vpxor \XMM6, \T2, \T2
1464 vmovdqu HashKey_3(arg2), \T5
1465 vpclmulqdq $0x11, \T5, \XMM6, \T4
1468 vpclmulqdq $0x00, \T5, \XMM6, \T4
1471 vmovdqu HashKey_3_k(arg2), \T3
1472 vpclmulqdq $0x00, \T3, \T2, \T2
1473 vpxor \T2, \XMM1, \XMM1
1475 ######################
1477 vpshufd $0b01001110, \XMM7, \T2
1478 vpxor \XMM7, \T2, \T2
1479 vmovdqu HashKey_2(arg2), \T5
1480 vpclmulqdq $0x11, \T5, \XMM7, \T4
1483 vpclmulqdq $0x00, \T5, \XMM7, \T4
1486 vmovdqu HashKey_2_k(arg2), \T3
1487 vpclmulqdq $0x00, \T3, \T2, \T2
1488 vpxor \T2, \XMM1, \XMM1
1490 ######################
1492 vpshufd $0b01001110, \XMM8, \T2
1493 vpxor \XMM8, \T2, \T2
1494 vmovdqu HashKey(arg2), \T5
1495 vpclmulqdq $0x11, \T5, \XMM8, \T4
1498 vpclmulqdq $0x00, \T5, \XMM8, \T4
1501 vmovdqu HashKey_k(arg2), \T3
1502 vpclmulqdq $0x00, \T3, \T2, \T2
1504 vpxor \T2, \XMM1, \XMM1
1505 vpxor \T6, \XMM1, \XMM1
1506 vpxor \T7, \XMM1, \T2
1511 vpslldq $8, \T2, \T4
1512 vpsrldq $8, \T2, \T2
1515 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1516 # the accumulated carry-less multiplications
1518 #######################################################################
1519 #first phase of the reduction
1520 vpslld $31, \T7, \T2 # packed left shifting << 31
1521 vpslld $30, \T7, \T3 # packed left shifting << 30
1522 vpslld $25, \T7, \T4 # packed left shifting << 25
1524 vpxor \T3, \T2, \T2 # xor the shifted versions
1527 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1529 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1530 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1531 #######################################################################
1534 #second phase of the reduction
1535 vpsrld $1, \T7, \T2 # packed right shifting >> 1
1536 vpsrld $2, \T7, \T3 # packed right shifting >> 2
1537 vpsrld $7, \T7, \T4 # packed right shifting >> 7
1538 vpxor \T3, \T2, \T2 # xor the shifted versions
1543 vpxor \T7, \T6, \T6 # the result is in T6
1547 #############################################################
1548 #void aesni_gcm_precomp_avx_gen2
1549 # (gcm_data *my_ctx_data,
1550 # gcm_context_data *data,
1551 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1552 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1553 # (from Security Association) concatenated with 8 byte
1554 # Initialisation Vector (from IPSec ESP Payload)
1555 # concatenated with 0x00000001. 16-byte aligned pointer. */
1556 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1557 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1558 #############################################################
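# Illustrative sketch of the HashKey<<1 mod poly step performed below
# (Python-style, in comments, big-integer view of the 128-bit value; not part
# of the assembly): if the bit shifted out at the top is set, the reduction
# constant POLY is folded back in.
#
#     def hashkey_shl1_mod_poly(h):
#         carry = (h >> 127) & 1
#         h = (h << 1) & ((1 << 128) - 1)
#         if carry:
#             h ^= 0xC2000000000000000000000000000001    # POLY
#         return h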
1559 ENTRY(aesni_gcm_precomp_avx_gen2)
1562 vmovdqu (arg3), %xmm6 # xmm6 = HashKey
1564 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1565 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1566 vmovdqa %xmm6, %xmm2
1567 vpsllq $1, %xmm6, %xmm6
1568 vpsrlq $63, %xmm2, %xmm2
1569 vmovdqa %xmm2, %xmm1
1570 vpslldq $8, %xmm2, %xmm2
1571 vpsrldq $8, %xmm1, %xmm1
1572 vpor %xmm2, %xmm6, %xmm6
1574 vpshufd $0b00100100, %xmm1, %xmm2
1575 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1576 vpand POLY(%rip), %xmm2, %xmm2
1577 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1578 #######################################################################
1579 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
1582 CALC_AAD_HASH GHASH_MUL_AVX, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
1584 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1588 ENDPROC(aesni_gcm_precomp_avx_gen2)
1590 ###############################################################################
1591 #void aesni_gcm_enc_avx_gen2(
1592 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1593 # gcm_context_data *data,
1594 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1595 # const u8 *in, /* Plaintext input */
1596 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1597 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1598 # (from Security Association) concatenated with 8 byte
1599 # Initialisation Vector (from IPSec ESP Payload)
1600 # concatenated with 0x00000001. 16-byte aligned pointer. */
1601 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1602 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1603 # u8 *auth_tag, /* Authenticated Tag output. */
1604 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1605 # Valid values are 16 (most likely), 12 or 8. */
1606 ###############################################################################
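# Illustrative sketch of the pre-counter block this routine expects in *iv
# (Python-style, in comments, not part of the assembly; build_j0 is only an
# illustrative name): 4-byte salt, 8-byte IV, then the 32-bit value 1, i.e.
# the RFC 4106 J0 block described above.
#
#     def build_j0(salt: bytes, iv: bytes) -> bytes:
#         assert len(salt) == 4 and len(iv) == 8
#         return salt + iv + (1).to_bytes(4, "big")      # 16 bytes total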
1607 ENTRY(aesni_gcm_enc_avx_gen2)
1615 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1619 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1623 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1626 ENDPROC(aesni_gcm_enc_avx_gen2)
1628 ###############################################################################
1629 #void aesni_gcm_dec_avx_gen2(
1630 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1631 # gcm_context_data *data,
1632 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1633 # const u8 *in, /* Ciphertext input */
1634 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
1635 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1636 # (from Security Association) concatenated with 8 byte
1637 # Initialisation Vector (from IPSec ESP Payload)
1638 # concatenated with 0x00000001. 16-byte aligned pointer. */
1639 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1640 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1641 # u8 *auth_tag, /* Authenticated Tag output. */
1642 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1643 # Valid values are 16 (most likely), 12 or 8. */
1644 ###############################################################################
1645 ENTRY(aesni_gcm_dec_avx_gen2)
1653 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1657 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1661 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1664 ENDPROC(aesni_gcm_dec_avx_gen2)
1665 #endif /* CONFIG_AS_AVX */
1667 #ifdef CONFIG_AS_AVX2
1668 ###############################################################################
1669 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1670 # Input: A and B (128-bits each, bit-reflected)
1671 # Output: C = A*B*x mod poly, (i.e. >>1 )
1672 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1673 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1674 ###############################################################################
1675 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1677 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1678 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1679 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1680 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1684 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1685 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1690 #######################################################################
1691 #first phase of the reduction
1692 vmovdqa POLY2(%rip), \T3
1694 vpclmulqdq $0x01, \GH, \T3, \T2
1695 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1697 vpxor \T2, \GH, \GH # first phase of the reduction complete
1698 #######################################################################
1699 #second phase of the reduction
1700 vpclmulqdq $0x00, \GH, \T3, \T2
1701 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1703 vpclmulqdq $0x10, \GH, \T3, \GH
1704 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1706 vpxor \T2, \GH, \GH # second phase of the reduction complete
1707 #######################################################################
1708 vpxor \T1, \GH, \GH # the result is in GH
1713 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1715 # precompute HashKey^2 .. HashKey^8, each <<1 mod poly
1717 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1718 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1720 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1721 vmovdqu \T5, HashKey_3(arg2)
1723 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1724 vmovdqu \T5, HashKey_4(arg2)
1726 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1727 vmovdqu \T5, HashKey_5(arg2)
1729 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1730 vmovdqu \T5, HashKey_6(arg2)
1732 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1733 vmovdqu \T5, HashKey_7(arg2)
1735 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1736 vmovdqu \T5, HashKey_8(arg2)
1740 ## if a = number of total plaintext bytes
1742 ## num_initial_blocks = b mod 8, where b = floor(a/16);
1743 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1744 ## r10, r11, r12, rax are clobbered
1745 ## arg1, arg3, arg4, r14 are used as pointers only, not modified
1747 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1748 i = (8-\num_initial_blocks)
1750 vmovdqu AadHash(arg2), reg_i
1752 # initialize the data pointer offset as zero
1755 # start AES for num_initial_blocks blocks
1756 mov arg6, %rax # rax = *Y0
1757 vmovdqu (%rax), \CTR # CTR = Y0
1758 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1761 i = (9-\num_initial_blocks)
1763 .rep \num_initial_blocks
1764 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1766 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1771 vmovdqa (arg1), \T_key
1772 i = (9-\num_initial_blocks)
1774 .rep \num_initial_blocks
1775 vpxor \T_key, reg_i, reg_i
1783 vmovdqa 16*j(arg1), \T_key
1784 i = (9-\num_initial_blocks)
1786 .rep \num_initial_blocks
1787 vaesenc \T_key, reg_i, reg_i
1797 vmovdqa 16*j(arg1), \T_key
1798 i = (9-\num_initial_blocks)
1800 .rep \num_initial_blocks
1801 vaesenclast \T_key, reg_i, reg_i
1806 i = (9-\num_initial_blocks)
1808 .rep \num_initial_blocks
1809 vmovdqu (arg4, %r11), \T1
1810 vpxor \T1, reg_i, reg_i
1811 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1812 # num_initial_blocks blocks
1817 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1823 i = (8-\num_initial_blocks)
1824 j = (9-\num_initial_blocks)
1827 .rep \num_initial_blocks
1828 vpxor reg_i, reg_j, reg_j
1829 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1834 # XMM8 has the combined result here
1836 vmovdqa \XMM8, TMP1(%rsp)
1840 jl _initial_blocks_done\@ # no need for precomputed constants
1842 ###############################################################################
1843 # prepare and encrypt 8 counter blocks for the main loop
1844 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1846 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1848 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1850 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1852 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1854 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1856 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1858 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1860 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1862 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1864 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1866 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1868 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1870 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1872 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1874 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1876 vmovdqa (arg1), \T_key
1877 vpxor \T_key, \XMM1, \XMM1
1878 vpxor \T_key, \XMM2, \XMM2
1879 vpxor \T_key, \XMM3, \XMM3
1880 vpxor \T_key, \XMM4, \XMM4
1881 vpxor \T_key, \XMM5, \XMM5
1882 vpxor \T_key, \XMM6, \XMM6
1883 vpxor \T_key, \XMM7, \XMM7
1884 vpxor \T_key, \XMM8, \XMM8
1888 .rep \REP # do REP rounds
1889 vmovdqa 16*i(arg1), \T_key
1890 vaesenc \T_key, \XMM1, \XMM1
1891 vaesenc \T_key, \XMM2, \XMM2
1892 vaesenc \T_key, \XMM3, \XMM3
1893 vaesenc \T_key, \XMM4, \XMM4
1894 vaesenc \T_key, \XMM5, \XMM5
1895 vaesenc \T_key, \XMM6, \XMM6
1896 vaesenc \T_key, \XMM7, \XMM7
1897 vaesenc \T_key, \XMM8, \XMM8
1903 vmovdqa 16*i(arg1), \T_key
1904 vaesenclast \T_key, \XMM1, \XMM1
1905 vaesenclast \T_key, \XMM2, \XMM2
1906 vaesenclast \T_key, \XMM3, \XMM3
1907 vaesenclast \T_key, \XMM4, \XMM4
1908 vaesenclast \T_key, \XMM5, \XMM5
1909 vaesenclast \T_key, \XMM6, \XMM6
1910 vaesenclast \T_key, \XMM7, \XMM7
1911 vaesenclast \T_key, \XMM8, \XMM8
1913 vmovdqu (arg4, %r11), \T1
1914 vpxor \T1, \XMM1, \XMM1
1915 vmovdqu \XMM1, (arg3 , %r11)
1920 vmovdqu 16*1(arg4, %r11), \T1
1921 vpxor \T1, \XMM2, \XMM2
1922 vmovdqu \XMM2, 16*1(arg3 , %r11)
1927 vmovdqu 16*2(arg4, %r11), \T1
1928 vpxor \T1, \XMM3, \XMM3
1929 vmovdqu \XMM3, 16*2(arg3 , %r11)
1934 vmovdqu 16*3(arg4, %r11), \T1
1935 vpxor \T1, \XMM4, \XMM4
1936 vmovdqu \XMM4, 16*3(arg3 , %r11)
1941 vmovdqu 16*4(arg4, %r11), \T1
1942 vpxor \T1, \XMM5, \XMM5
1943 vmovdqu \XMM5, 16*4(arg3 , %r11)
1948 vmovdqu 16*5(arg4, %r11), \T1
1949 vpxor \T1, \XMM6, \XMM6
1950 vmovdqu \XMM6, 16*5(arg3 , %r11)
1955 vmovdqu 16*6(arg4, %r11), \T1
1956 vpxor \T1, \XMM7, \XMM7
1957 vmovdqu \XMM7, 16*6(arg3 , %r11)
1962 vmovdqu 16*7(arg4, %r11), \T1
1963 vpxor \T1, \XMM8, \XMM8
1964 vmovdqu \XMM8, 16*7(arg3 , %r11)
1971 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1972 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1973 # the corresponding ciphertext
1974 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1975 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1976 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1977 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1978 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1979 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1980 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1982 ###############################################################################
1984 _initial_blocks_done\@:
1991 # encrypt 8 blocks at a time
1992 # ghash the 8 previously encrypted ciphertext blocks
1993 # arg1, arg3, arg4 are used as pointers only, not modified
1994 # r11 is the data offset value
1995 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1998 vmovdqa \XMM2, TMP2(%rsp)
1999 vmovdqa \XMM3, TMP3(%rsp)
2000 vmovdqa \XMM4, TMP4(%rsp)
2001 vmovdqa \XMM5, TMP5(%rsp)
2002 vmovdqa \XMM6, TMP6(%rsp)
2003 vmovdqa \XMM7, TMP7(%rsp)
2004 vmovdqa \XMM8, TMP8(%rsp)
2006 .if \loop_idx == in_order
2007 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2008 vpaddd ONE(%rip), \XMM1, \XMM2
2009 vpaddd ONE(%rip), \XMM2, \XMM3
2010 vpaddd ONE(%rip), \XMM3, \XMM4
2011 vpaddd ONE(%rip), \XMM4, \XMM5
2012 vpaddd ONE(%rip), \XMM5, \XMM6
2013 vpaddd ONE(%rip), \XMM6, \XMM7
2014 vpaddd ONE(%rip), \XMM7, \XMM8
2017 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2018 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2019 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2020 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2021 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2022 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2023 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2024 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2026 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2027 vpaddd ONEf(%rip), \XMM1, \XMM2
2028 vpaddd ONEf(%rip), \XMM2, \XMM3
2029 vpaddd ONEf(%rip), \XMM3, \XMM4
2030 vpaddd ONEf(%rip), \XMM4, \XMM5
2031 vpaddd ONEf(%rip), \XMM5, \XMM6
2032 vpaddd ONEf(%rip), \XMM6, \XMM7
2033 vpaddd ONEf(%rip), \XMM7, \XMM8
2038 #######################################################################
2041 vpxor \T1, \XMM1, \XMM1
2042 vpxor \T1, \XMM2, \XMM2
2043 vpxor \T1, \XMM3, \XMM3
2044 vpxor \T1, \XMM4, \XMM4
2045 vpxor \T1, \XMM5, \XMM5
2046 vpxor \T1, \XMM6, \XMM6
2047 vpxor \T1, \XMM7, \XMM7
2048 vpxor \T1, \XMM8, \XMM8
2050 #######################################################################
2056 vmovdqu 16*1(arg1), \T1
2057 vaesenc \T1, \XMM1, \XMM1
2058 vaesenc \T1, \XMM2, \XMM2
2059 vaesenc \T1, \XMM3, \XMM3
2060 vaesenc \T1, \XMM4, \XMM4
2061 vaesenc \T1, \XMM5, \XMM5
2062 vaesenc \T1, \XMM6, \XMM6
2063 vaesenc \T1, \XMM7, \XMM7
2064 vaesenc \T1, \XMM8, \XMM8
2066 vmovdqu 16*2(arg1), \T1
2067 vaesenc \T1, \XMM1, \XMM1
2068 vaesenc \T1, \XMM2, \XMM2
2069 vaesenc \T1, \XMM3, \XMM3
2070 vaesenc \T1, \XMM4, \XMM4
2071 vaesenc \T1, \XMM5, \XMM5
2072 vaesenc \T1, \XMM6, \XMM6
2073 vaesenc \T1, \XMM7, \XMM7
2074 vaesenc \T1, \XMM8, \XMM8
2077 #######################################################################
2079 vmovdqu HashKey_8(arg2), \T5
2080 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2081 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2082 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2083 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2086 vmovdqu 16*3(arg1), \T1
2087 vaesenc \T1, \XMM1, \XMM1
2088 vaesenc \T1, \XMM2, \XMM2
2089 vaesenc \T1, \XMM3, \XMM3
2090 vaesenc \T1, \XMM4, \XMM4
2091 vaesenc \T1, \XMM5, \XMM5
2092 vaesenc \T1, \XMM6, \XMM6
2093 vaesenc \T1, \XMM7, \XMM7
2094 vaesenc \T1, \XMM8, \XMM8
2096 vmovdqa TMP2(%rsp), \T1
2097 vmovdqu HashKey_7(arg2), \T5
2098 vpclmulqdq $0x11, \T5, \T1, \T3
2101 vpclmulqdq $0x00, \T5, \T1, \T3
2104 vpclmulqdq $0x01, \T5, \T1, \T3
2107 vpclmulqdq $0x10, \T5, \T1, \T3
2110 vmovdqu 16*4(arg1), \T1
2111 vaesenc \T1, \XMM1, \XMM1
2112 vaesenc \T1, \XMM2, \XMM2
2113 vaesenc \T1, \XMM3, \XMM3
2114 vaesenc \T1, \XMM4, \XMM4
2115 vaesenc \T1, \XMM5, \XMM5
2116 vaesenc \T1, \XMM6, \XMM6
2117 vaesenc \T1, \XMM7, \XMM7
2118 vaesenc \T1, \XMM8, \XMM8
2120 #######################################################################
2122 vmovdqa TMP3(%rsp), \T1
2123 vmovdqu HashKey_6(arg2), \T5
2124 vpclmulqdq $0x11, \T5, \T1, \T3
2127 vpclmulqdq $0x00, \T5, \T1, \T3
2130 vpclmulqdq $0x01, \T5, \T1, \T3
2133 vpclmulqdq $0x10, \T5, \T1, \T3
2136 vmovdqu 16*5(arg1), \T1
2137 vaesenc \T1, \XMM1, \XMM1
2138 vaesenc \T1, \XMM2, \XMM2
2139 vaesenc \T1, \XMM3, \XMM3
2140 vaesenc \T1, \XMM4, \XMM4
2141 vaesenc \T1, \XMM5, \XMM5
2142 vaesenc \T1, \XMM6, \XMM6
2143 vaesenc \T1, \XMM7, \XMM7
2144 vaesenc \T1, \XMM8, \XMM8
2146 vmovdqa TMP4(%rsp), \T1
2147 vmovdqu HashKey_5(arg2), \T5
2148 vpclmulqdq $0x11, \T5, \T1, \T3
2151 vpclmulqdq $0x00, \T5, \T1, \T3
2154 vpclmulqdq $0x01, \T5, \T1, \T3
2157 vpclmulqdq $0x10, \T5, \T1, \T3
2160 vmovdqu 16*6(arg1), \T1
2161 vaesenc \T1, \XMM1, \XMM1
2162 vaesenc \T1, \XMM2, \XMM2
2163 vaesenc \T1, \XMM3, \XMM3
2164 vaesenc \T1, \XMM4, \XMM4
2165 vaesenc \T1, \XMM5, \XMM5
2166 vaesenc \T1, \XMM6, \XMM6
2167 vaesenc \T1, \XMM7, \XMM7
2168 vaesenc \T1, \XMM8, \XMM8
2171 vmovdqa TMP5(%rsp), \T1
2172 vmovdqu HashKey_4(arg2), \T5
2173 vpclmulqdq $0x11, \T5, \T1, \T3
2176 vpclmulqdq $0x00, \T5, \T1, \T3
2179 vpclmulqdq $0x01, \T5, \T1, \T3
2182 vpclmulqdq $0x10, \T5, \T1, \T3
2185 vmovdqu 16*7(arg1), \T1
2186 vaesenc \T1, \XMM1, \XMM1
2187 vaesenc \T1, \XMM2, \XMM2
2188 vaesenc \T1, \XMM3, \XMM3
2189 vaesenc \T1, \XMM4, \XMM4
2190 vaesenc \T1, \XMM5, \XMM5
2191 vaesenc \T1, \XMM6, \XMM6
2192 vaesenc \T1, \XMM7, \XMM7
2193 vaesenc \T1, \XMM8, \XMM8
2195 vmovdqa TMP6(%rsp), \T1
2196 vmovdqu HashKey_3(arg2), \T5
2197 vpclmulqdq $0x11, \T5, \T1, \T3
2200 vpclmulqdq $0x00, \T5, \T1, \T3
2203 vpclmulqdq $0x01, \T5, \T1, \T3
2206 vpclmulqdq $0x10, \T5, \T1, \T3
        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

#######################################################################
        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1
        vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
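        # With ENC the freshly produced ciphertext stays in XMM1-XMM8 (reg_j) so
        # it can be GHASHed on the next iteration; with DEC the ciphertext is
        # reloaded into XMM1-XMM8 for GHASH and the decrypted plaintext held in
        # T3 is written out instead.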
#######################################################################

        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7
#######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                            # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
#######################################################################
        vmovdqu \XMM1, 16*0(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)                  # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)                  # Write to the Ciphertext buffer
#######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                           # second phase of the reduction complete
#######################################################################
        vpxor   \T4, \T1, \T1                           # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap
        vpxor   \T1, \XMM1, \XMM1                       # combine GHASHed value with the corresponding ciphertext

.endm


# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################
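        # This macro uses the Karatsuba method: per block it needs only three
        # carry-less products (a1*b1, a0*b0 and (a1^a0)*(b1^b0)); the middle
        # term is recovered with XORs.  A hypothetical C sketch of the same
        # idea with PCLMULQDQ intrinsics (illustration only, names are not
        # from this file):
        #
        #   #include <immintrin.h>      /* requires -mpclmul */
        #
        #   static void clmul_128_karatsuba(__m128i a, __m128i b,
        #                                   __m128i *hi, __m128i *lo)
        #   {
        #           __m128i h  = _mm_clmulepi64_si128(a, b, 0x11);             /* a1*b1 */
        #           __m128i l  = _mm_clmulepi64_si128(a, b, 0x00);             /* a0*b0 */
        #           __m128i af = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e)); /* a1^a0 */
        #           __m128i bf = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e)); /* b1^b0 */
        #           __m128i m  = _mm_clmulepi64_si128(af, bf, 0x00);  /* (a1^a0)*(b1^b0) */
        #
        #           m   = _mm_xor_si128(m, _mm_xor_si128(h, l));      /* = a1*b0 ^ a0*b1 */
        #           *lo = _mm_xor_si128(l, _mm_slli_si128(m, 8));
        #           *hi = _mm_xor_si128(h, _mm_srli_si128(m, 8));
        #   }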
        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################
        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
#######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
#######################################################################

        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
#######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6
.endm
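# A note on the reduction above (the same two-phase scheme appears in
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2): the 256-bit carry-less product held in
# <T6:T7> is reduced modulo the GHASH polynomial
#
#       g(x) = x^128 + x^7 + x^2 + x + 1,
#
# using the congruence  x^128 == x^7 + x^2 + x + 1  (mod g(x)).  Because the
# field elements are kept bit-reflected, the 128 excess bits are folded back
# in two 64-bit steps: each phase is one PCLMULQDQ against the precomputed
# POLY2 constant followed by a doubleword shift, and the closing XOR leaves
# the reduced 128-bit GHASH state (T6 here, T1 in the parallel macro).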
#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         u8     *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#         const  u8 *aad, /* Additional Authentication Data (AAD) */
#         u64    aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        FUNC_SAVE

        vmovdqu  (arg3), %xmm6                  # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6
        vpsrlq   $63, %xmm2, %xmm2
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2
        vpsrldq  $8, %xmm1, %xmm1
        vpor     %xmm2, %xmm6, %xmm6

        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6            # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)           # store HashKey<<1 mod poly
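        # The sequence above doubles the byte-swapped hash key in GF(2^128):
        # shift the 128-bit value left by one and, if a bit falls off the top,
        # reduce by XORing in the polynomial constant (POLY).  A hypothetical
        # scalar C sketch of the same computation, assuming POLY encodes the
        # reflected GHASH polynomial 0xC2000000000000000000000000000001
        # (illustration only, not part of this file):
        #
        #   #include <stdint.h>
        #
        #   /* h[1]:h[0] = high:low 64-bit halves of the byte-swapped HashKey */
        #   static void gf128_double(uint64_t h[2])
        #   {
        #           uint64_t carry = h[1] >> 63;    /* bit shifted out of the top */
        #
        #           h[1] = (h[1] << 1) | (h[0] >> 63);
        #           h[0] <<= 1;
        #           if (carry) {                    /* fold the polynomial back in */
        #                   h[1] ^= 0xC200000000000000ULL;
        #                   h[0] ^= 0x0000000000000001ULL;
        #           }
        #   }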
        CALC_AAD_HASH GHASH_MUL_AVX2, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        PRECOMPUTE_AVX2  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
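# For orientation, a hedged C-side sketch of how these entry points might be
# driven; the extern declarations simply mirror the prototype comments above,
# while the struct layout, variable names and call sequence are assumptions
# (the real callers live in the glue code, not in this file):
#
#   struct gcm_context_data;           /* per-request GCM state, defined by the caller */
#
#   extern void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
#                           struct gcm_context_data *data, u8 *hash_subkey,
#                           u8 *iv, const u8 *aad, u64 aad_len);
#   extern void aesni_gcm_enc_avx_gen4(void *my_ctx_data,
#                           struct gcm_context_data *data, u8 *out, const u8 *in,
#                           u64 plaintext_len, u8 *iv, const u8 *aad, u64 aad_len,
#                           u8 *auth_tag, u64 auth_tag_len);
#
#   /* hypothetical one-shot encryption: precompute hash keys, then encrypt */
#   aesni_gcm_precomp_avx_gen4(ctx, &data, hash_subkey, iv, aad, aad_len);
#   aesni_gcm_enc_avx_gen4(ctx, &data, dst, src, len, iv, aad, aad_len, tag, 16);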
ENTRY(aesni_gcm_enc_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc4
        cmp     $16, %eax
        je      key_128_enc4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        ret
key_128_enc4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        ret
key_256_enc4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_dec4
        cmp     $16, %eax
        je      key_128_dec4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        ret
key_128_dec4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        ret
key_256_dec4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)
#endif /* CONFIG_AS_AVX2 */