1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
## Authors:
## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
51 ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
## for example, assume AAD is a u32 vector
##
## if AAD is 8 bytes:
## AAD[2] = {A0, A1};
## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
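##
## For illustration, the byte-level padding that produces the xmm layouts
## above can be written in C (a sketch; pad_aad() is our name, not a
## function used by this file):
##
##	#include <stdint.h>
##	#include <string.h>
##
##	/* Zero-pad an 8/12/16-byte AAD into one 16-byte GHASH block. */
##	static void pad_aad(const uint8_t *aad, size_t aad_len,
##			    uint8_t block[16])
##	{
##		memset(block, 0, 16);
##		memcpy(block, aad, aad_len); /* AAD fills the leading bytes */
##	}
##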
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. one tab is
## for the GHASH part, two tabs are for the AES part.
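##
## As a reference for the pclmulqdq-based math below, GHASH multiplication
## in GF(2^128) can be modeled bit-serially as in NIST SP 800-38D (a C
## sketch; gf128_mul() is our name and is not part of this file):
##
##	#include <stdint.h>
##
##	typedef struct { uint64_t hi, lo; } be128; /* hi = first 8 block bytes */
##
##	static be128 gf128_mul(be128 x, be128 y)
##	{
##		be128 z = { 0, 0 }, v = y;
##		int i;
##
##		for (i = 0; i < 128; i++) {
##			/* bit i of x, most significant block bit first */
##			uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
##						: (x.lo >> (127 - i)) & 1;
##			uint64_t carry = v.lo & 1;
##
##			if (bit) {		/* z ^= v */
##				z.hi ^= v.hi;
##				z.lo ^= v.lo;
##			}
##			/* v = v*x: shift right, fold with R = 0xe1 << 120, */
##			/* the byte-reflected form of the polynomial above  */
##			v.lo = (v.lo >> 1) | (v.hi << 63);
##			v.hi >>= 1;
##			if (carry)
##				v.hi ^= 0xe100000000000000ULL;
##		}
##		return z;
##	}
##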
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
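#
# Why two increment constants: the 32-bit counter is the big-endian field
# in bytes 12..15 of the block.  After a SHUF_MASK byte reflection that
# field sits in the low dword, so vpaddd with ONE increments it; in the
# original byte order, vpaddd with ONEf adds 1<<24 to the top dword, which
# bumps the counter's low byte without an extra vpshufb (usable while that
# byte does not wrap; the in_order variant covers the general case).  In
# portable C the increment is simply (a sketch, names ours):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void gcm_ctr_inc(uint8_t blk[16])
#	{
#		uint32_t c;
#
#		memcpy(&c, blk + 12, 4);	/* big-endian counter */
#		c = __builtin_bswap32(__builtin_bswap32(c) + 1);
#		memcpy(blk + 12, &c, 4);
#	}
#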
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
## define the fields of the gcm aes context
#{
187 # u8 expanded_keys[16*11] store expanded keys
188 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
189 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
190 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
191 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
192 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
193 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
194 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
195 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
196 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
197 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
198 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
199 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
200 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
201 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
202 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx;
206 HashKey = 16*11 # store HashKey <<1 mod poly here
207 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
208 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
209 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
210 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
211 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
212 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
213 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
214 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
215 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
216 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
217 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
218 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
219 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
220 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
221 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
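#
# Why the *_k values exist: a 128x128 carry-less multiply a*b with
# a = a1:a0, b = b1:b0 needs a1*b1, a0*b0 and the middle term
# a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0 (Karatsuba), so only
# three PCLMULQDQ ops; HashKey_i_k caches the (b1+b0) = hi^lo halves of
# each key power.  A C sketch of the identity (names ours):
#
#	#include <stdint.h>
#
#	typedef struct { uint64_t hi, lo; } u128;
#
#	static u128 clmul64(uint64_t a, uint64_t b) /* bitwise PCLMULQDQ */
#	{
#		u128 r = { 0, 0 };
#		int i;
#
#		for (i = 0; i < 64; i++)
#			if ((b >> i) & 1) {
#				r.lo ^= a << i;
#				if (i)
#					r.hi ^= a >> (64 - i);
#			}
#		return r;
#	}
#
#	static void clmul128(u128 a, u128 b, u128 *hi, u128 *lo)
#	{
#		u128 t1 = clmul64(a.hi, b.hi);		/* a1*b1 */
#		u128 t0 = clmul64(a.lo, b.lo);		/* a0*b0 */
#		u128 tm = clmul64(a.hi ^ a.lo,		/* (a1+a0)*(b1+b0); */
#				  b.hi ^ b.lo);		/* b.hi^b.lo is *_k */
#
#		tm.hi ^= t1.hi ^ t0.hi;			/* middle = a1*b0 + a0*b1 */
#		tm.lo ^= t1.lo ^ t0.lo;
#		lo->lo = t0.lo;				/* recombine: t1<<128 ^ tm<<64 ^ t0 */
#		lo->hi = t0.hi ^ tm.lo;
#		hi->lo = t1.lo ^ tm.hi;
#		hi->hi = t1.hi;
#	}
#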
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
230 #define arg8 STACK_OFFSET+8*2(%r14)
231 #define arg9 STACK_OFFSET+8*3(%r14)
.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
# need to push 4 registers onto the stack to maintain
STACK_OFFSET = 8*4
255 TMP1 = 16*0 # Temporary storage for AAD
256 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
257 TMP3 = 16*2 # Temporary storage for AES State 3
258 TMP4 = 16*3 # Temporary storage for AES State 4
259 TMP5 = 16*4 # Temporary storage for AES State 5
260 TMP6 = 16*5 # Temporary storage for AES State 6
261 TMP7 = 16*6 # Temporary storage for AES State 7
262 TMP8 = 16*7 # Temporary storage for AES State 8
264 VARIABLE_OFFSET = 16*8
266 ################################
268 ################################
270 # Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK XMM0
vpxor (arg1), \XMM0, \XMM0
i = 1
setreg
.rep 9
vaesenc 16*i(arg1), \XMM0, \XMM0
i = (i+1)
setreg
.endr
vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm
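#
# C model of the macro above using AES-NI intrinsics, for AES-128 with
# the 11 round keys laid out at the start of the context (a sketch, not
# kernel code; names ours):
#
#	#include <wmmintrin.h>
#
#	static __m128i aes128_encrypt_block(__m128i blk, const __m128i rk[11])
#	{
#		int i;
#
#		blk = _mm_xor_si128(blk, rk[0]);	/* whitening (vpxor) */
#		for (i = 1; i <= 9; i++)		/* 9 middle rounds */
#			blk = _mm_aesenc_si128(blk, rk[i]);
#		return _mm_aesenclast_si128(blk, rk[10]); /* final round */
#	}
#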
283 # combined for GCM encrypt and decrypt functions
284 # clobbering all xmm registers
285 # clobbering r10, r11, r12, r13, r14, r15
286 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
288 #the number of pushes must equal STACK_OFFSET
299 sub $VARIABLE_OFFSET, %rsp
300 and $~63, %rsp # align rsp to 64 bytes
303 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
305 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
306 and $-16, %r13 # r13 = r13 - (r13 mod 16)
311 jz _initial_num_blocks_is_0\@
314 je _initial_num_blocks_is_7\@
316 je _initial_num_blocks_is_6\@
318 je _initial_num_blocks_is_5\@
320 je _initial_num_blocks_is_4\@
322 je _initial_num_blocks_is_3\@
324 je _initial_num_blocks_is_2\@
326 jmp _initial_num_blocks_is_1\@
328 _initial_num_blocks_is_7\@:
329 \INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
331 jmp _initial_blocks_encrypted\@
333 _initial_num_blocks_is_6\@:
334 \INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
336 jmp _initial_blocks_encrypted\@
338 _initial_num_blocks_is_5\@:
339 \INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
341 jmp _initial_blocks_encrypted\@
343 _initial_num_blocks_is_4\@:
344 \INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
346 jmp _initial_blocks_encrypted\@
348 _initial_num_blocks_is_3\@:
349 \INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
351 jmp _initial_blocks_encrypted\@
353 _initial_num_blocks_is_2\@:
354 \INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
356 jmp _initial_blocks_encrypted\@
358 _initial_num_blocks_is_1\@:
359 \INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
361 jmp _initial_blocks_encrypted\@
363 _initial_num_blocks_is_0\@:
364 \INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
367 _initial_blocks_encrypted\@:
369 je _zero_cipher_left\@
372 je _eight_cipher_left\@
379 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
389 \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
392 jne _encrypt_by_8_new\@
394 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
395 jmp _eight_cipher_left\@
398 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 \GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
401 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404 jne _encrypt_by_8_new\@
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
411 _eight_cipher_left\@:
412 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
417 jl _only_less_than_16\@
420 and $15, %r13 # r13 = (arg4 mod 16)
422 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
427 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
428 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
429 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
433 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
435 lea SHIFT_MASK+16(%rip), %r12
436 sub %r13, %r12 # adjust the shuffle mask pointer to be
437 # able to shift 16-r13 bytes (r13 is the
438 # number of bytes in plaintext mod 16)
439 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
440 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
441 jmp _final_ghash_mul\@
443 _only_less_than_16\@:
446 and $15, %r13 # r13 = (arg4 mod 16)
448 je _multiple_of_16_bytes\@
450 # handle the last <16 Byte block separately
453 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
454 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
455 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
458 lea SHIFT_MASK+16(%rip), %r12
459 sub %r13, %r12 # adjust the shuffle mask pointer to be
460 # able to shift 16-r13 bytes (r13 is the
461 # number of bytes in plaintext mod 16)
463 _get_last_16_byte_loop\@:
464 movb (arg3, %r11), %al
465 movb %al, TMP1 (%rsp , %r11)
468 jne _get_last_16_byte_loop\@
470 vmovdqu TMP1(%rsp), %xmm1
_final_ghash_mul\@:

.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm2
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm2, %xmm2
482 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
483 vpxor %xmm2, %xmm14, %xmm14
484 #GHASH computation for the last <16 Byte block
485 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
490 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
491 # mask out top 16-r13 bytes of xmm9
492 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
493 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
494 vpxor %xmm9, %xmm14, %xmm14
495 #GHASH computation for the last <16 Byte block
496 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
.endif
503 #############################
507 jle _less_than_8_bytes_left\@
509 mov %rax, (arg2 , %r11)
511 vpsrldq $8, %xmm9, %xmm9
515 _less_than_8_bytes_left\@:
516 movb %al, (arg2 , %r11)
520 jne _less_than_8_bytes_left\@
521 #############################
523 _multiple_of_16_bytes\@:
524 mov arg7, %r12 # r12 = aadLen (number of bytes)
525 shl $3, %r12 # convert into number of bits
526 vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*8)
vmovq arg4, %xmm1
530 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
531 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
533 vpxor %xmm15, %xmm14, %xmm14
534 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
535 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
537 mov arg5, %rax # rax = *Y0
538 vmovdqu (%rax), %xmm9 # xmm9 = Y0
540 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
542 vpxor %xmm14, %xmm9, %xmm9
547 mov arg8, %r10 # r10 = authTag
548 mov arg9, %r11 # r11 = auth_tag_len
561 vpsrldq $8, %xmm9, %xmm9
569 vpsrldq $4, %xmm9, %xmm9
586 vmovdqu %xmm9, (%r10)
598 ###############################################################################
599 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
600 # Input: A and B (128-bits each, bit-reflected)
601 # Output: C = A*B*x mod poly, (i.e. >>1 )
602 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
603 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
604 ###############################################################################
605 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
607 vpshufd $0b01001110, \GH, \T2
608 vpshufd $0b01001110, \HK, \T3
609 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
610 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
612 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
613 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
614 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
616 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
618 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
619 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
vpxor \T3, \GH, \GH
vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
623 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shifting << 31
vpslld $30, \GH, \T3 # packed left shifting << 30
vpslld $25, \GH, \T4 # packed left shifting << 25
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
631 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
633 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
634 vpxor \T2, \GH, \GH # first phase of the reduction complete
636 #second phase of the reduction
vpsrld $1, \GH, \T2 # packed right shifting >> 1
vpsrld $2, \GH, \T3 # packed right shifting >> 2
vpsrld $7, \GH, \T4 # packed right shifting >> 7
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpxor \T5, \T2, \T2
vpxor \T2, \GH, \GH
vpxor \T1, \GH, \GH # the result is in GH

.endm
651 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XOR of the low and high halves of HashKey_i

vmovdqa \HK, \T5

vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_k(arg1)
660 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
661 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_2_k(arg1)
666 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
667 vmovdqa \T5, HashKey_3(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_3_k(arg1)
672 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
673 vmovdqa \T5, HashKey_4(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_4_k(arg1)
678 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
679 vmovdqa \T5, HashKey_5(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_5_k(arg1)
684 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
685 vmovdqa \T5, HashKey_6(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_6_k(arg1)
690 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
691 vmovdqa \T5, HashKey_7(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_7_k(arg1)
696 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
697 vmovdqa \T5, HashKey_8(arg1)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_8_k(arg1)

.endm
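#
# In plain GF(2^128) terms the precompute step is just repeated
# multiplication by the hash key, plus the folded halves for Karatsuba
# (a C sketch that ignores the <<1/bit-reflection bookkeeping handled by
# GHASH_MUL_AVX; gf128_mul() is the bit-serial reference from the file
# header, and the array layout is ours):
#
#	#include <stdint.h>
#
#	typedef struct { uint64_t hi, lo; } be128;
#
#	be128 gf128_mul(be128 a, be128 b);	/* reference multiply */
#
#	static void precompute(be128 h, be128 hpow[9], uint64_t k[9])
#	{
#		int i;
#
#		hpow[1] = h;
#		for (i = 2; i <= 8; i++)
#			hpow[i] = gf128_mul(hpow[i - 1], h);	/* H^i */
#		for (i = 1; i <= 8; i++)
#			k[i] = hpow[i].hi ^ hpow[i].lo;		/* HashKey_i_k */
#	}
#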
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8;
707 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
708 ## r10, r11, r12, rax are clobbered
709 ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
711 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
712 i = (8-\num_initial_blocks)
716 mov arg6, %r10 # r10 = AAD
717 mov arg7, %r12 # r12 = aadLen
722 vpxor reg_j, reg_j, reg_j
723 vpxor reg_i, reg_i, reg_i
727 vmovdqu (%r10), reg_i
728 vpshufb SHUF_MASK(%rip), reg_i, reg_i
729 vpxor reg_i, reg_j, reg_j
730 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
735 jge _get_AAD_blocks\@
740 vpxor reg_i, reg_i, reg_i
742 /* read the last <16B of AAD. since we have at least 4B of
743 data right after the AAD (the ICV, and maybe some CT), we can
744 read 4B/8B blocks safely, and then get rid of the extra stuff */
752 vpsrldq $8, reg_i, reg_i
753 vpxor \T1, reg_i, reg_i
762 vpslldq $12, \T1, \T1
763 vpsrldq $4, reg_i, reg_i
764 vpxor \T1, reg_i, reg_i
766 /* finalize: shift out the extra bytes we read, and align
767 left. since pslldq can only shift by an immediate, we use
768 vpshufb and an array of shuffle masks */
771 movdqu aad_shift_arr(%r11), \T1
772 vpshufb \T1, reg_i, reg_i
773 _get_AAD_rest_final\@:
774 vpshufb SHUF_MASK(%rip), reg_i, reg_i
775 vpxor reg_j, reg_i, reg_i
776 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
779 # initialize the data pointer offset as zero
782 # start AES for num_initial_blocks blocks
783 mov arg5, %rax # rax = *Y0
784 vmovdqu (%rax), \CTR # CTR = Y0
785 vpshufb SHUF_MASK(%rip), \CTR, \CTR
788 i = (9-\num_initial_blocks)
790 .rep \num_initial_blocks
vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
vmovdqa \CTR, reg_i
vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
798 vmovdqa (arg1), \T_key
799 i = (9-\num_initial_blocks)
801 .rep \num_initial_blocks
802 vpxor \T_key, reg_i, reg_i
810 vmovdqa 16*j(arg1), \T_key
811 i = (9-\num_initial_blocks)
813 .rep \num_initial_blocks
814 vaesenc \T_key, reg_i, reg_i
824 vmovdqa 16*10(arg1), \T_key
825 i = (9-\num_initial_blocks)
827 .rep \num_initial_blocks
828 vaesenclast \T_key, reg_i, reg_i
833 i = (9-\num_initial_blocks)
835 .rep \num_initial_blocks
836 vmovdqu (arg3, %r11), \T1
837 vpxor \T1, reg_i, reg_i
838 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
843 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
849 i = (8-\num_initial_blocks)
850 j = (9-\num_initial_blocks)
853 .rep \num_initial_blocks
854 vpxor reg_i, reg_j, reg_j
855 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
860 # XMM8 has the combined result here
862 vmovdqa \XMM8, TMP1(%rsp)
866 jl _initial_blocks_done\@ # no need for precomputed constants
868 ###############################################################################
# HashKey_i_k holds the XOR of the low and high halves of HashKey_i
870 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
872 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
874 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
876 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
878 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
880 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
882 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
884 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
886 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
888 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
890 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
892 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
894 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
896 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
898 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
900 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
902 vmovdqa (arg1), \T_key
903 vpxor \T_key, \XMM1, \XMM1
904 vpxor \T_key, \XMM2, \XMM2
905 vpxor \T_key, \XMM3, \XMM3
906 vpxor \T_key, \XMM4, \XMM4
907 vpxor \T_key, \XMM5, \XMM5
908 vpxor \T_key, \XMM6, \XMM6
909 vpxor \T_key, \XMM7, \XMM7
910 vpxor \T_key, \XMM8, \XMM8
915 vmovdqa 16*i(arg1), \T_key
916 vaesenc \T_key, \XMM1, \XMM1
917 vaesenc \T_key, \XMM2, \XMM2
918 vaesenc \T_key, \XMM3, \XMM3
919 vaesenc \T_key, \XMM4, \XMM4
920 vaesenc \T_key, \XMM5, \XMM5
921 vaesenc \T_key, \XMM6, \XMM6
922 vaesenc \T_key, \XMM7, \XMM7
923 vaesenc \T_key, \XMM8, \XMM8
929 vmovdqa 16*i(arg1), \T_key
930 vaesenclast \T_key, \XMM1, \XMM1
931 vaesenclast \T_key, \XMM2, \XMM2
932 vaesenclast \T_key, \XMM3, \XMM3
933 vaesenclast \T_key, \XMM4, \XMM4
934 vaesenclast \T_key, \XMM5, \XMM5
935 vaesenclast \T_key, \XMM6, \XMM6
936 vaesenclast \T_key, \XMM7, \XMM7
937 vaesenclast \T_key, \XMM8, \XMM8
939 vmovdqu (arg3, %r11), \T1
940 vpxor \T1, \XMM1, \XMM1
941 vmovdqu \XMM1, (arg2 , %r11)
946 vmovdqu 16*1(arg3, %r11), \T1
947 vpxor \T1, \XMM2, \XMM2
948 vmovdqu \XMM2, 16*1(arg2 , %r11)
953 vmovdqu 16*2(arg3, %r11), \T1
954 vpxor \T1, \XMM3, \XMM3
955 vmovdqu \XMM3, 16*2(arg2 , %r11)
960 vmovdqu 16*3(arg3, %r11), \T1
961 vpxor \T1, \XMM4, \XMM4
962 vmovdqu \XMM4, 16*3(arg2 , %r11)
967 vmovdqu 16*4(arg3, %r11), \T1
968 vpxor \T1, \XMM5, \XMM5
969 vmovdqu \XMM5, 16*4(arg2 , %r11)
974 vmovdqu 16*5(arg3, %r11), \T1
975 vpxor \T1, \XMM6, \XMM6
976 vmovdqu \XMM6, 16*5(arg2 , %r11)
981 vmovdqu 16*6(arg3, %r11), \T1
982 vpxor \T1, \XMM7, \XMM7
983 vmovdqu \XMM7, 16*6(arg2 , %r11)
988 vmovdqu 16*7(arg3, %r11), \T1
989 vpxor \T1, \XMM8, \XMM8
990 vmovdqu \XMM8, 16*7(arg2 , %r11)
997 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
998 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
999 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1000 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1001 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1002 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1003 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1004 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1005 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1007 ###############################################################################
_initial_blocks_done\@:

.endm
1013 # encrypt 8 blocks at a time
1014 # ghash the 8 previously encrypted ciphertext blocks
1015 # arg1, arg2, arg3 are used as pointers only, not modified
1016 # r11 is the data offset value
1017 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1020 vmovdqa \XMM2, TMP2(%rsp)
1021 vmovdqa \XMM3, TMP3(%rsp)
1022 vmovdqa \XMM4, TMP4(%rsp)
1023 vmovdqa \XMM5, TMP5(%rsp)
1024 vmovdqa \XMM6, TMP6(%rsp)
1025 vmovdqa \XMM7, TMP7(%rsp)
1026 vmovdqa \XMM8, TMP8(%rsp)
1028 .if \loop_idx == in_order
1029 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1030 vpaddd ONE(%rip), \XMM1, \XMM2
1031 vpaddd ONE(%rip), \XMM2, \XMM3
1032 vpaddd ONE(%rip), \XMM3, \XMM4
1033 vpaddd ONE(%rip), \XMM4, \XMM5
1034 vpaddd ONE(%rip), \XMM5, \XMM6
1035 vpaddd ONE(%rip), \XMM6, \XMM7
1036 vpaddd ONE(%rip), \XMM7, \XMM8
1039 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1040 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1041 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1042 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1043 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1044 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1045 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1046 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1048 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1049 vpaddd ONEf(%rip), \XMM1, \XMM2
1050 vpaddd ONEf(%rip), \XMM2, \XMM3
1051 vpaddd ONEf(%rip), \XMM3, \XMM4
1052 vpaddd ONEf(%rip), \XMM4, \XMM5
1053 vpaddd ONEf(%rip), \XMM5, \XMM6
1054 vpaddd ONEf(%rip), \XMM6, \XMM7
1055 vpaddd ONEf(%rip), \XMM7, \XMM8
1060 #######################################################################
1063 vpxor \T1, \XMM1, \XMM1
1064 vpxor \T1, \XMM2, \XMM2
1065 vpxor \T1, \XMM3, \XMM3
1066 vpxor \T1, \XMM4, \XMM4
1067 vpxor \T1, \XMM5, \XMM5
1068 vpxor \T1, \XMM6, \XMM6
1069 vpxor \T1, \XMM7, \XMM7
1070 vpxor \T1, \XMM8, \XMM8
1072 #######################################################################
1078 vmovdqu 16*1(arg1), \T1
1079 vaesenc \T1, \XMM1, \XMM1
1080 vaesenc \T1, \XMM2, \XMM2
1081 vaesenc \T1, \XMM3, \XMM3
1082 vaesenc \T1, \XMM4, \XMM4
1083 vaesenc \T1, \XMM5, \XMM5
1084 vaesenc \T1, \XMM6, \XMM6
1085 vaesenc \T1, \XMM7, \XMM7
1086 vaesenc \T1, \XMM8, \XMM8
1088 vmovdqu 16*2(arg1), \T1
1089 vaesenc \T1, \XMM1, \XMM1
1090 vaesenc \T1, \XMM2, \XMM2
1091 vaesenc \T1, \XMM3, \XMM3
1092 vaesenc \T1, \XMM4, \XMM4
1093 vaesenc \T1, \XMM5, \XMM5
1094 vaesenc \T1, \XMM6, \XMM6
1095 vaesenc \T1, \XMM7, \XMM7
1096 vaesenc \T1, \XMM8, \XMM8
1099 #######################################################################
1101 vmovdqa HashKey_8(arg1), \T5
1102 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1103 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1105 vpshufd $0b01001110, \T2, \T6
1108 vmovdqa HashKey_8_k(arg1), \T5
1109 vpclmulqdq $0x00, \T5, \T6, \T6
1111 vmovdqu 16*3(arg1), \T1
1112 vaesenc \T1, \XMM1, \XMM1
1113 vaesenc \T1, \XMM2, \XMM2
1114 vaesenc \T1, \XMM3, \XMM3
1115 vaesenc \T1, \XMM4, \XMM4
1116 vaesenc \T1, \XMM5, \XMM5
1117 vaesenc \T1, \XMM6, \XMM6
1118 vaesenc \T1, \XMM7, \XMM7
1119 vaesenc \T1, \XMM8, \XMM8
1121 vmovdqa TMP2(%rsp), \T1
1122 vmovdqa HashKey_7(arg1), \T5
1123 vpclmulqdq $0x11, \T5, \T1, \T3
1125 vpclmulqdq $0x00, \T5, \T1, \T3
1128 vpshufd $0b01001110, \T1, \T3
1130 vmovdqa HashKey_7_k(arg1), \T5
1131 vpclmulqdq $0x10, \T5, \T3, \T3
1134 vmovdqu 16*4(arg1), \T1
1135 vaesenc \T1, \XMM1, \XMM1
1136 vaesenc \T1, \XMM2, \XMM2
1137 vaesenc \T1, \XMM3, \XMM3
1138 vaesenc \T1, \XMM4, \XMM4
1139 vaesenc \T1, \XMM5, \XMM5
1140 vaesenc \T1, \XMM6, \XMM6
1141 vaesenc \T1, \XMM7, \XMM7
1142 vaesenc \T1, \XMM8, \XMM8
1144 #######################################################################
1146 vmovdqa TMP3(%rsp), \T1
1147 vmovdqa HashKey_6(arg1), \T5
1148 vpclmulqdq $0x11, \T5, \T1, \T3
1150 vpclmulqdq $0x00, \T5, \T1, \T3
1153 vpshufd $0b01001110, \T1, \T3
1155 vmovdqa HashKey_6_k(arg1), \T5
1156 vpclmulqdq $0x10, \T5, \T3, \T3
1159 vmovdqu 16*5(arg1), \T1
1160 vaesenc \T1, \XMM1, \XMM1
1161 vaesenc \T1, \XMM2, \XMM2
1162 vaesenc \T1, \XMM3, \XMM3
1163 vaesenc \T1, \XMM4, \XMM4
1164 vaesenc \T1, \XMM5, \XMM5
1165 vaesenc \T1, \XMM6, \XMM6
1166 vaesenc \T1, \XMM7, \XMM7
1167 vaesenc \T1, \XMM8, \XMM8
1169 vmovdqa TMP4(%rsp), \T1
1170 vmovdqa HashKey_5(arg1), \T5
1171 vpclmulqdq $0x11, \T5, \T1, \T3
1173 vpclmulqdq $0x00, \T5, \T1, \T3
1176 vpshufd $0b01001110, \T1, \T3
1178 vmovdqa HashKey_5_k(arg1), \T5
1179 vpclmulqdq $0x10, \T5, \T3, \T3
1182 vmovdqu 16*6(arg1), \T1
1183 vaesenc \T1, \XMM1, \XMM1
1184 vaesenc \T1, \XMM2, \XMM2
1185 vaesenc \T1, \XMM3, \XMM3
1186 vaesenc \T1, \XMM4, \XMM4
1187 vaesenc \T1, \XMM5, \XMM5
1188 vaesenc \T1, \XMM6, \XMM6
1189 vaesenc \T1, \XMM7, \XMM7
1190 vaesenc \T1, \XMM8, \XMM8
1193 vmovdqa TMP5(%rsp), \T1
1194 vmovdqa HashKey_4(arg1), \T5
1195 vpclmulqdq $0x11, \T5, \T1, \T3
1197 vpclmulqdq $0x00, \T5, \T1, \T3
1200 vpshufd $0b01001110, \T1, \T3
1202 vmovdqa HashKey_4_k(arg1), \T5
1203 vpclmulqdq $0x10, \T5, \T3, \T3
1206 vmovdqu 16*7(arg1), \T1
1207 vaesenc \T1, \XMM1, \XMM1
1208 vaesenc \T1, \XMM2, \XMM2
1209 vaesenc \T1, \XMM3, \XMM3
1210 vaesenc \T1, \XMM4, \XMM4
1211 vaesenc \T1, \XMM5, \XMM5
1212 vaesenc \T1, \XMM6, \XMM6
1213 vaesenc \T1, \XMM7, \XMM7
1214 vaesenc \T1, \XMM8, \XMM8
1216 vmovdqa TMP6(%rsp), \T1
1217 vmovdqa HashKey_3(arg1), \T5
1218 vpclmulqdq $0x11, \T5, \T1, \T3
1220 vpclmulqdq $0x00, \T5, \T1, \T3
1223 vpshufd $0b01001110, \T1, \T3
1225 vmovdqa HashKey_3_k(arg1), \T5
1226 vpclmulqdq $0x10, \T5, \T3, \T3
1230 vmovdqu 16*8(arg1), \T1
1231 vaesenc \T1, \XMM1, \XMM1
1232 vaesenc \T1, \XMM2, \XMM2
1233 vaesenc \T1, \XMM3, \XMM3
1234 vaesenc \T1, \XMM4, \XMM4
1235 vaesenc \T1, \XMM5, \XMM5
1236 vaesenc \T1, \XMM6, \XMM6
1237 vaesenc \T1, \XMM7, \XMM7
1238 vaesenc \T1, \XMM8, \XMM8
1240 vmovdqa TMP7(%rsp), \T1
1241 vmovdqa HashKey_2(arg1), \T5
1242 vpclmulqdq $0x11, \T5, \T1, \T3
1244 vpclmulqdq $0x00, \T5, \T1, \T3
1247 vpshufd $0b01001110, \T1, \T3
1249 vmovdqa HashKey_2_k(arg1), \T5
1250 vpclmulqdq $0x10, \T5, \T3, \T3
1253 #######################################################################
1255 vmovdqu 16*9(arg1), \T5
1256 vaesenc \T5, \XMM1, \XMM1
1257 vaesenc \T5, \XMM2, \XMM2
1258 vaesenc \T5, \XMM3, \XMM3
1259 vaesenc \T5, \XMM4, \XMM4
1260 vaesenc \T5, \XMM5, \XMM5
1261 vaesenc \T5, \XMM6, \XMM6
1262 vaesenc \T5, \XMM7, \XMM7
1263 vaesenc \T5, \XMM8, \XMM8
1265 vmovdqa TMP8(%rsp), \T1
1266 vmovdqa HashKey(arg1), \T5
1267 vpclmulqdq $0x11, \T5, \T1, \T3
1269 vpclmulqdq $0x00, \T5, \T1, \T3
1272 vpshufd $0b01001110, \T1, \T3
1274 vmovdqa HashKey_k(arg1), \T5
1275 vpclmulqdq $0x10, \T5, \T3, \T3
1281 vmovdqu 16*10(arg1), \T5
1287 vpxor 16*i(arg3, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
vmovdqu 16*i(arg3, %r11), reg_j
vmovdqu \T3, 16*i(arg2, %r11)
.endif
1299 #######################################################################
1302 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
vpxor \T3, \T7, \T7
vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1309 #######################################################################
1310 #first phase of the reduction
1311 #######################################################################
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
1319 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1321 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1322 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1323 #######################################################################
1325 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
1326 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
1327 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
1328 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
1329 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
1330 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
1331 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
1332 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
1335 #######################################################################
1336 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpxor \T1, \T2, \T2
vpxor \T2, \T7, \T7
vpxor \T7, \T6, \T6 # the result is in T6
1346 #######################################################################
1348 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1349 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1350 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1351 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1352 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1353 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1354 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1355 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
vpxor \T6, \XMM1, \XMM1

.endm
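#
# What the interleaving above computes, GHASH-wise: one aggregated
# reduction over eight blocks using the precomputed key powers,
#
#	Y' = X1*H^8 + X2*H^7 + ... + X8*H	(all in GF(2^128))
#
# with the running digest folded into X1, instead of eight dependent
# (Y ^ Xi)*H steps.  A C sketch (gf128_mul() as in the file header;
# names ours):
#
#	#include <stdint.h>
#
#	typedef struct { uint64_t hi, lo; } be128;
#
#	be128 gf128_mul(be128 a, be128 b);
#
#	static be128 ghash_8_blocks(be128 y, const be128 x[8],
#				    const be128 hpow[9])
#	{
#		be128 acc = { 0, 0 };
#		int i;
#
#		for (i = 0; i < 8; i++) {
#			be128 t = x[i];
#
#			if (i == 0) {		/* fold digest into X1 */
#				t.hi ^= y.hi;
#				t.lo ^= y.lo;
#			}
#			t = gf128_mul(t, hpow[8 - i]); /* X(i+1) * H^(8-i) */
#			acc.hi ^= t.hi;
#			acc.lo ^= t.lo;
#		}
#		return acc;
#	}
#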
# GHASH the last 8 ciphertext blocks.
1366 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1371 vpshufd $0b01001110, \XMM1, \T2
1372 vpxor \XMM1, \T2, \T2
1373 vmovdqa HashKey_8(arg1), \T5
1374 vpclmulqdq $0x11, \T5, \XMM1, \T6
1375 vpclmulqdq $0x00, \T5, \XMM1, \T7
1377 vmovdqa HashKey_8_k(arg1), \T3
1378 vpclmulqdq $0x00, \T3, \T2, \XMM1
1380 ######################
1382 vpshufd $0b01001110, \XMM2, \T2
1383 vpxor \XMM2, \T2, \T2
1384 vmovdqa HashKey_7(arg1), \T5
1385 vpclmulqdq $0x11, \T5, \XMM2, \T4
1388 vpclmulqdq $0x00, \T5, \XMM2, \T4
1391 vmovdqa HashKey_7_k(arg1), \T3
1392 vpclmulqdq $0x00, \T3, \T2, \T2
1393 vpxor \T2, \XMM1, \XMM1
1395 ######################
1397 vpshufd $0b01001110, \XMM3, \T2
1398 vpxor \XMM3, \T2, \T2
1399 vmovdqa HashKey_6(arg1), \T5
1400 vpclmulqdq $0x11, \T5, \XMM3, \T4
1403 vpclmulqdq $0x00, \T5, \XMM3, \T4
1406 vmovdqa HashKey_6_k(arg1), \T3
1407 vpclmulqdq $0x00, \T3, \T2, \T2
1408 vpxor \T2, \XMM1, \XMM1
1410 ######################
1412 vpshufd $0b01001110, \XMM4, \T2
1413 vpxor \XMM4, \T2, \T2
1414 vmovdqa HashKey_5(arg1), \T5
1415 vpclmulqdq $0x11, \T5, \XMM4, \T4
1418 vpclmulqdq $0x00, \T5, \XMM4, \T4
1421 vmovdqa HashKey_5_k(arg1), \T3
1422 vpclmulqdq $0x00, \T3, \T2, \T2
1423 vpxor \T2, \XMM1, \XMM1
1425 ######################
1427 vpshufd $0b01001110, \XMM5, \T2
1428 vpxor \XMM5, \T2, \T2
1429 vmovdqa HashKey_4(arg1), \T5
1430 vpclmulqdq $0x11, \T5, \XMM5, \T4
1433 vpclmulqdq $0x00, \T5, \XMM5, \T4
1436 vmovdqa HashKey_4_k(arg1), \T3
1437 vpclmulqdq $0x00, \T3, \T2, \T2
1438 vpxor \T2, \XMM1, \XMM1
1440 ######################
1442 vpshufd $0b01001110, \XMM6, \T2
1443 vpxor \XMM6, \T2, \T2
1444 vmovdqa HashKey_3(arg1), \T5
1445 vpclmulqdq $0x11, \T5, \XMM6, \T4
1448 vpclmulqdq $0x00, \T5, \XMM6, \T4
1451 vmovdqa HashKey_3_k(arg1), \T3
1452 vpclmulqdq $0x00, \T3, \T2, \T2
1453 vpxor \T2, \XMM1, \XMM1
1455 ######################
1457 vpshufd $0b01001110, \XMM7, \T2
1458 vpxor \XMM7, \T2, \T2
1459 vmovdqa HashKey_2(arg1), \T5
1460 vpclmulqdq $0x11, \T5, \XMM7, \T4
1463 vpclmulqdq $0x00, \T5, \XMM7, \T4
1466 vmovdqa HashKey_2_k(arg1), \T3
1467 vpclmulqdq $0x00, \T3, \T2, \T2
1468 vpxor \T2, \XMM1, \XMM1
1470 ######################
1472 vpshufd $0b01001110, \XMM8, \T2
1473 vpxor \XMM8, \T2, \T2
1474 vmovdqa HashKey(arg1), \T5
1475 vpclmulqdq $0x11, \T5, \XMM8, \T4
1478 vpclmulqdq $0x00, \T5, \XMM8, \T4
1481 vmovdqa HashKey_k(arg1), \T3
1482 vpclmulqdq $0x00, \T3, \T2, \T2
1484 vpxor \T2, \XMM1, \XMM1
1485 vpxor \T6, \XMM1, \XMM1
1486 vpxor \T7, \XMM1, \T2
1491 vpslldq $8, \T2, \T4
1492 vpsrldq $8, \T2, \T2
vpxor \T4, \T7, \T7
vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1496 # the accumulated carry-less multiplications
1498 #######################################################################
1499 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
1507 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1509 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1510 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1511 #######################################################################
1514 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpxor \T1, \T2, \T2
vpxor \T2, \T7, \T7
vpxor \T7, \T6, \T6 # the result is in T6

.endm
1527 #############################################################
1528 #void aesni_gcm_precomp_avx_gen2
1529 # (gcm_data *my_ctx_data,
# u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1531 #############################################################
1532 ENTRY(aesni_gcm_precomp_avx_gen2)
1533 #the number of pushes must equal STACK_OFFSET
1543 sub $VARIABLE_OFFSET, %rsp
1544 and $~63, %rsp # align rsp to 64 bytes
1546 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1548 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1549 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1550 vmovdqa %xmm6, %xmm2
1551 vpsllq $1, %xmm6, %xmm6
1552 vpsrlq $63, %xmm2, %xmm2
1553 vmovdqa %xmm2, %xmm1
1554 vpslldq $8, %xmm2, %xmm2
1555 vpsrldq $8, %xmm1, %xmm1
1556 vpor %xmm2, %xmm6, %xmm6
1558 vpshufd $0b00100100, %xmm1, %xmm2
1559 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1560 vpand POLY(%rip), %xmm2, %xmm2
1561 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1562 #######################################################################
1563 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
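#
# C model of the shift/compare/mask sequence above (HashKey <<1 mod poly;
# POLY is the 0xC2...01 constant, i.e. x^128 + x^127 + x^126 + x^121 + 1;
# a sketch with h[1] holding the high 64 bits -- the asm realizes the
# conditional branchlessly via vpshufd/vpcmpeqd/vpand):
#
#	#include <stdint.h>
#
#	static void hashkey_shl1_mod_poly(uint64_t h[2])
#	{
#		int msb = h[1] >> 63;		/* bit shifted out */
#
#		h[1] = (h[1] << 1) | (h[0] >> 63);
#		h[0] <<= 1;
#		if (msb) {			/* reduce with POLY */
#			h[0] ^= 0x0000000000000001ULL;
#			h[1] ^= 0xc200000000000000ULL;
#		}
#	}
#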
1566 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1575 ENDPROC(aesni_gcm_precomp_avx_gen2)
1577 ###############################################################################
1578 #void aesni_gcm_enc_avx_gen2(
1579 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1580 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1581 # const u8 *in, /* Plaintext input */
1582 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1583 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1584 # (from Security Association) concatenated with 8 byte
1585 # Initialisation Vector (from IPSec ESP Payload)
1586 # concatenated with 0x00000001. 16-byte aligned pointer. */
1587 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1588 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1589 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1591 # Valid values are 16 (most likely), 12 or 8. */
1592 ###############################################################################
1593 ENTRY(aesni_gcm_enc_avx_gen2)
1594 GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
1596 ENDPROC(aesni_gcm_enc_avx_gen2)
1598 ###############################################################################
1599 #void aesni_gcm_dec_avx_gen2(
1600 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1601 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1602 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for decryption. */
1604 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1605 # (from Security Association) concatenated with 8 byte
1606 # Initialisation Vector (from IPSec ESP Payload)
1607 # concatenated with 0x00000001. 16-byte aligned pointer. */
1608 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1609 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1610 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1612 # Valid values are 16 (most likely), 12 or 8. */
1613 ###############################################################################
1614 ENTRY(aesni_gcm_dec_avx_gen2)
1615 GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
1617 ENDPROC(aesni_gcm_dec_avx_gen2)
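#
# Hypothetical call sequence, with the prototypes transcribed from the
# comment blocks above (in GCM, H is the encryption of the all-zero block
# and is derived by the caller; variable names here are ours):
#
#	u8 hash_subkey[16];	/* H = AES-K(0^128) */
#	u8 j0[16];		/* salt || IV || 0x00000001, 16-byte aligned */
#	u8 tag[16], tag2[16];
#
#	aesni_gcm_precomp_avx_gen2(my_ctx_data, hash_subkey);
#	aesni_gcm_enc_avx_gen2(my_ctx_data, dst, src, len,
#			       j0, aad, aad_len, tag, 16);
#	aesni_gcm_dec_avx_gen2(my_ctx_data, dst2, src2, len,
#			       j0, aad, aad_len, tag2, 16);
#	/* for decrypt, the caller compares tag2 against the received tag */
#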
1618 #endif /* CONFIG_AS_AVX */
1620 #ifdef CONFIG_AS_AVX2
1621 ###############################################################################
1622 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1623 # Input: A and B (128-bits each, bit-reflected)
1624 # Output: C = A*B*x mod poly, (i.e. >>1 )
1625 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1626 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1627 ###############################################################################
1628 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1630 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1631 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1632 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1633 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
vpxor \T3, \GH, \GH # GH = a1*b0 + a0*b1 (middle term)
vpsrldq $8, \GH, \T3 # shift-R GH 2 DWs
vpslldq $8, \GH, \GH # shift-L GH 2 DWs
vpxor \T3, \T1, \T1
vpxor \T2, \GH, \GH # <T1:GH> holds the 256-bit product
1643 #######################################################################
1644 #first phase of the reduction
1645 vmovdqa POLY2(%rip), \T3
1647 vpclmulqdq $0x01, \GH, \T3, \T2
1648 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1650 vpxor \T2, \GH, \GH # first phase of the reduction complete
1651 #######################################################################
1652 #second phase of the reduction
1653 vpclmulqdq $0x00, \GH, \T3, \T2
1654 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1656 vpclmulqdq $0x10, \GH, \T3, \GH
1657 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1659 vpxor \T2, \GH, \GH # second phase of the reduction complete
1660 #######################################################################
vpxor \T1, \GH, \GH # the result is in GH

.endm
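#
# Design note on the AVX2 variant: instead of the dword shift/XOR chains
# of the AVX1 reduction, both phases here are carry-less multiplies
# against the POLY2 constant (the reduction polynomial packed into both
# qword positions), so the first phase is a single vpclmulqdq plus an
# 8-byte shift and the second phase two vpclmulqdq plus 4-byte shifts,
# as visible in the macro body above.
#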
1666 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XOR of the low and high halves of HashKey_i

vmovdqa \HK, \T5
1670 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1671 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1673 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1674 vmovdqa \T5, HashKey_3(arg1)
1676 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1677 vmovdqa \T5, HashKey_4(arg1)
1679 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1680 vmovdqa \T5, HashKey_5(arg1)
1682 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1683 vmovdqa \T5, HashKey_6(arg1)
1685 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1686 vmovdqa \T5, HashKey_7(arg1)
1688 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
vmovdqa \T5, HashKey_8(arg1)

.endm
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8;
1697 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1698 ## r10, r11, r12, rax are clobbered
1699 ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1701 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1702 i = (8-\num_initial_blocks)
1706 mov arg6, %r10 # r10 = AAD
1707 mov arg7, %r12 # r12 = aadLen
1712 vpxor reg_j, reg_j, reg_j
1713 vpxor reg_i, reg_i, reg_i
1718 vmovdqu (%r10), reg_i
1719 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1720 vpxor reg_i, reg_j, reg_j
1721 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1726 jge _get_AAD_blocks\@
1727 vmovdqu reg_j, reg_i
1731 vpxor reg_i, reg_i, reg_i
1733 /* read the last <16B of AAD. since we have at least 4B of
1734 data right after the AAD (the ICV, and maybe some CT), we can
1735 read 4B/8B blocks safely, and then get rid of the extra stuff */
1738 jle _get_AAD_rest4\@
1742 vpslldq $8, \T1, \T1
1743 vpsrldq $8, reg_i, reg_i
1744 vpxor \T1, reg_i, reg_i
1745 jmp _get_AAD_rest8\@
1748 jle _get_AAD_rest0\@
1753 vpslldq $12, \T1, \T1
1754 vpsrldq $4, reg_i, reg_i
1755 vpxor \T1, reg_i, reg_i
1757 /* finalize: shift out the extra bytes we read, and align
1758 left. since pslldq can only shift by an immediate, we use
1759 vpshufb and an array of shuffle masks */
1762 movdqu aad_shift_arr(%r11), \T1
1763 vpshufb \T1, reg_i, reg_i
1764 _get_AAD_rest_final\@:
1765 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1766 vpxor reg_j, reg_i, reg_i
1767 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1770 # initialize the data pointer offset as zero
1773 # start AES for num_initial_blocks blocks
1774 mov arg5, %rax # rax = *Y0
1775 vmovdqu (%rax), \CTR # CTR = Y0
1776 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1779 i = (9-\num_initial_blocks)
1781 .rep \num_initial_blocks
vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
vmovdqa \CTR, reg_i
vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1789 vmovdqa (arg1), \T_key
1790 i = (9-\num_initial_blocks)
1792 .rep \num_initial_blocks
1793 vpxor \T_key, reg_i, reg_i
1801 vmovdqa 16*j(arg1), \T_key
1802 i = (9-\num_initial_blocks)
1804 .rep \num_initial_blocks
1805 vaesenc \T_key, reg_i, reg_i
1815 vmovdqa 16*10(arg1), \T_key
1816 i = (9-\num_initial_blocks)
1818 .rep \num_initial_blocks
1819 vaesenclast \T_key, reg_i, reg_i
1824 i = (9-\num_initial_blocks)
1826 .rep \num_initial_blocks
1827 vmovdqu (arg3, %r11), \T1
1828 vpxor \T1, reg_i, reg_i
1829 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1830 # num_initial_blocks blocks
1835 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1841 i = (8-\num_initial_blocks)
1842 j = (9-\num_initial_blocks)
1845 .rep \num_initial_blocks
1846 vpxor reg_i, reg_j, reg_j
1847 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1852 # XMM8 has the combined result here
1854 vmovdqa \XMM8, TMP1(%rsp)
1858 jl _initial_blocks_done\@ # no need for precomputed constants
1860 ###############################################################################
# HashKey_i_k holds the XOR of the low and high halves of HashKey_i
1862 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1864 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1866 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1868 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1870 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1872 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1874 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1876 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1878 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1880 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1882 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1884 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1886 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1888 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1890 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1892 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1894 vmovdqa (arg1), \T_key
1895 vpxor \T_key, \XMM1, \XMM1
1896 vpxor \T_key, \XMM2, \XMM2
1897 vpxor \T_key, \XMM3, \XMM3
1898 vpxor \T_key, \XMM4, \XMM4
1899 vpxor \T_key, \XMM5, \XMM5
1900 vpxor \T_key, \XMM6, \XMM6
1901 vpxor \T_key, \XMM7, \XMM7
1902 vpxor \T_key, \XMM8, \XMM8
1906 .rep 9 # do 9 rounds
1907 vmovdqa 16*i(arg1), \T_key
1908 vaesenc \T_key, \XMM1, \XMM1
1909 vaesenc \T_key, \XMM2, \XMM2
1910 vaesenc \T_key, \XMM3, \XMM3
1911 vaesenc \T_key, \XMM4, \XMM4
1912 vaesenc \T_key, \XMM5, \XMM5
1913 vaesenc \T_key, \XMM6, \XMM6
1914 vaesenc \T_key, \XMM7, \XMM7
1915 vaesenc \T_key, \XMM8, \XMM8
1921 vmovdqa 16*i(arg1), \T_key
1922 vaesenclast \T_key, \XMM1, \XMM1
1923 vaesenclast \T_key, \XMM2, \XMM2
1924 vaesenclast \T_key, \XMM3, \XMM3
1925 vaesenclast \T_key, \XMM4, \XMM4
1926 vaesenclast \T_key, \XMM5, \XMM5
1927 vaesenclast \T_key, \XMM6, \XMM6
1928 vaesenclast \T_key, \XMM7, \XMM7
1929 vaesenclast \T_key, \XMM8, \XMM8
1931 vmovdqu (arg3, %r11), \T1
1932 vpxor \T1, \XMM1, \XMM1
1933 vmovdqu \XMM1, (arg2 , %r11)
1938 vmovdqu 16*1(arg3, %r11), \T1
1939 vpxor \T1, \XMM2, \XMM2
1940 vmovdqu \XMM2, 16*1(arg2 , %r11)
1945 vmovdqu 16*2(arg3, %r11), \T1
1946 vpxor \T1, \XMM3, \XMM3
1947 vmovdqu \XMM3, 16*2(arg2 , %r11)
1952 vmovdqu 16*3(arg3, %r11), \T1
1953 vpxor \T1, \XMM4, \XMM4
1954 vmovdqu \XMM4, 16*3(arg2 , %r11)
1959 vmovdqu 16*4(arg3, %r11), \T1
1960 vpxor \T1, \XMM5, \XMM5
1961 vmovdqu \XMM5, 16*4(arg2 , %r11)
1966 vmovdqu 16*5(arg3, %r11), \T1
1967 vpxor \T1, \XMM6, \XMM6
1968 vmovdqu \XMM6, 16*5(arg2 , %r11)
1973 vmovdqu 16*6(arg3, %r11), \T1
1974 vpxor \T1, \XMM7, \XMM7
1975 vmovdqu \XMM7, 16*6(arg2 , %r11)
1980 vmovdqu 16*7(arg3, %r11), \T1
1981 vpxor \T1, \XMM8, \XMM8
1982 vmovdqu \XMM8, 16*7(arg2 , %r11)
1989 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1990 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1991 # the corresponding ciphertext
1992 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1993 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1994 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1995 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1996 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1997 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1998 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2000 ###############################################################################
_initial_blocks_done\@:

.endm
2009 # encrypt 8 blocks at a time
2010 # ghash the 8 previously encrypted ciphertext blocks
2011 # arg1, arg2, arg3 are used as pointers only, not modified
2012 # r11 is the data offset value
2013 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2016 vmovdqa \XMM2, TMP2(%rsp)
2017 vmovdqa \XMM3, TMP3(%rsp)
2018 vmovdqa \XMM4, TMP4(%rsp)
2019 vmovdqa \XMM5, TMP5(%rsp)
2020 vmovdqa \XMM6, TMP6(%rsp)
2021 vmovdqa \XMM7, TMP7(%rsp)
2022 vmovdqa \XMM8, TMP8(%rsp)
2024 .if \loop_idx == in_order
2025 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2026 vpaddd ONE(%rip), \XMM1, \XMM2
2027 vpaddd ONE(%rip), \XMM2, \XMM3
2028 vpaddd ONE(%rip), \XMM3, \XMM4
2029 vpaddd ONE(%rip), \XMM4, \XMM5
2030 vpaddd ONE(%rip), \XMM5, \XMM6
2031 vpaddd ONE(%rip), \XMM6, \XMM7
2032 vpaddd ONE(%rip), \XMM7, \XMM8
2035 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2036 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2037 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2038 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2039 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2040 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2041 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2042 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2044 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2045 vpaddd ONEf(%rip), \XMM1, \XMM2
2046 vpaddd ONEf(%rip), \XMM2, \XMM3
2047 vpaddd ONEf(%rip), \XMM3, \XMM4
2048 vpaddd ONEf(%rip), \XMM4, \XMM5
2049 vpaddd ONEf(%rip), \XMM5, \XMM6
2050 vpaddd ONEf(%rip), \XMM6, \XMM7
2051 vpaddd ONEf(%rip), \XMM7, \XMM8
2056 #######################################################################
2059 vpxor \T1, \XMM1, \XMM1
2060 vpxor \T1, \XMM2, \XMM2
2061 vpxor \T1, \XMM3, \XMM3
2062 vpxor \T1, \XMM4, \XMM4
2063 vpxor \T1, \XMM5, \XMM5
2064 vpxor \T1, \XMM6, \XMM6
2065 vpxor \T1, \XMM7, \XMM7
2066 vpxor \T1, \XMM8, \XMM8
2068 #######################################################################
2074 vmovdqu 16*1(arg1), \T1
2075 vaesenc \T1, \XMM1, \XMM1
2076 vaesenc \T1, \XMM2, \XMM2
2077 vaesenc \T1, \XMM3, \XMM3
2078 vaesenc \T1, \XMM4, \XMM4
2079 vaesenc \T1, \XMM5, \XMM5
2080 vaesenc \T1, \XMM6, \XMM6
2081 vaesenc \T1, \XMM7, \XMM7
2082 vaesenc \T1, \XMM8, \XMM8
2084 vmovdqu 16*2(arg1), \T1
2085 vaesenc \T1, \XMM1, \XMM1
2086 vaesenc \T1, \XMM2, \XMM2
2087 vaesenc \T1, \XMM3, \XMM3
2088 vaesenc \T1, \XMM4, \XMM4
2089 vaesenc \T1, \XMM5, \XMM5
2090 vaesenc \T1, \XMM6, \XMM6
2091 vaesenc \T1, \XMM7, \XMM7
2092 vaesenc \T1, \XMM8, \XMM8
2095 #######################################################################
2097 vmovdqa HashKey_8(arg1), \T5
2098 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2099 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2100 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2101 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2104 vmovdqu 16*3(arg1), \T1
2105 vaesenc \T1, \XMM1, \XMM1
2106 vaesenc \T1, \XMM2, \XMM2
2107 vaesenc \T1, \XMM3, \XMM3
2108 vaesenc \T1, \XMM4, \XMM4
2109 vaesenc \T1, \XMM5, \XMM5
2110 vaesenc \T1, \XMM6, \XMM6
2111 vaesenc \T1, \XMM7, \XMM7
2112 vaesenc \T1, \XMM8, \XMM8
2114 vmovdqa TMP2(%rsp), \T1
2115 vmovdqa HashKey_7(arg1), \T5
2116 vpclmulqdq $0x11, \T5, \T1, \T3
2119 vpclmulqdq $0x00, \T5, \T1, \T3
2122 vpclmulqdq $0x01, \T5, \T1, \T3
2125 vpclmulqdq $0x10, \T5, \T1, \T3
2128 vmovdqu 16*4(arg1), \T1
2129 vaesenc \T1, \XMM1, \XMM1
2130 vaesenc \T1, \XMM2, \XMM2
2131 vaesenc \T1, \XMM3, \XMM3
2132 vaesenc \T1, \XMM4, \XMM4
2133 vaesenc \T1, \XMM5, \XMM5
2134 vaesenc \T1, \XMM6, \XMM6
2135 vaesenc \T1, \XMM7, \XMM7
2136 vaesenc \T1, \XMM8, \XMM8
2138 #######################################################################
2140 vmovdqa TMP3(%rsp), \T1
2141 vmovdqa HashKey_6(arg1), \T5
2142 vpclmulqdq $0x11, \T5, \T1, \T3
2145 vpclmulqdq $0x00, \T5, \T1, \T3
2148 vpclmulqdq $0x01, \T5, \T1, \T3
2151 vpclmulqdq $0x10, \T5, \T1, \T3
2154 vmovdqu 16*5(arg1), \T1
2155 vaesenc \T1, \XMM1, \XMM1
2156 vaesenc \T1, \XMM2, \XMM2
2157 vaesenc \T1, \XMM3, \XMM3
2158 vaesenc \T1, \XMM4, \XMM4
2159 vaesenc \T1, \XMM5, \XMM5
2160 vaesenc \T1, \XMM6, \XMM6
2161 vaesenc \T1, \XMM7, \XMM7
2162 vaesenc \T1, \XMM8, \XMM8
2164 vmovdqa TMP4(%rsp), \T1
2165 vmovdqa HashKey_5(arg1), \T5
2166 vpclmulqdq $0x11, \T5, \T1, \T3
2169 vpclmulqdq $0x00, \T5, \T1, \T3
2172 vpclmulqdq $0x01, \T5, \T1, \T3
2175 vpclmulqdq $0x10, \T5, \T1, \T3
                vmovdqu 16*6(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP5(%rsp), \T1
        vmovdqa         HashKey_4(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
                vmovdqu 16*7(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqa         HashKey_3(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
                vmovdqu 16*8(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqa         HashKey_2(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6
        #######################################################################

                vmovdqu 16*9(arg1), \T5
                vaesenc \T5, \XMM1, \XMM1
                vaesenc \T5, \XMM2, \XMM2
                vaesenc \T5, \XMM3, \XMM3
                vaesenc \T5, \XMM4, \XMM4
                vaesenc \T5, \XMM5, \XMM5
                vaesenc \T5, \XMM6, \XMM6
                vaesenc \T5, \XMM7, \XMM7
                vaesenc \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqa         HashKey(arg1), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1
                vmovdqu 16*10(arg1), \T5

        i = 0
        j = 1
        setreg
.rep 8
                vpxor   16*i(arg3, %r11), \T5, \T2
                .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
                .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu 16*i(arg3, %r11), reg_j
                vmovdqu \T3, 16*i(arg2, %r11)
                .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
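        ## (added note) in the last AES round the ENC path writes the
        ## encrypted counter blocks straight into XMM1..XMM8 (reg_j), while
        ## the DEC path reloads the ciphertext into reg_j so that the next
        ## GHASH pass is computed over ciphertext, as GCM requires.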
        #######################################################################
        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7

        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
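        ## (added note) <T1:T7> now holds the 256-bit carry-less product of
        ## the accumulated GHASH; the two-phase reduction folds it modulo
        ## g(x) = x^128 + x^127 + x^126 + x^121 + 1 using two multiplies by
        ## the precomputed POLY2 constant.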
                vmovdqu \XMM1, 16*0(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM2, 16*1(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM3, 16*2(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM4, 16*3(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM5, 16*4(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM6, 16*5(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM7, 16*6(arg2,%r11)  # Write to the Ciphertext buffer
                vmovdqu \XMM8, 16*7(arg2,%r11)  # Write to the Ciphertext buffer
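        ## (added note) the eight stores above are placed between the two
        ## reduction phases to overlap memory traffic with the vpclmulqdq
        ## dependency chain.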
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1               # fold the reduced GHASH result into block 1 for the next pass

.endm
# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        vmovdqa         HashKey_8(arg1), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################
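        ## (added note, not in the original source) each block below computes
        ## one 128x128 carry-less product with three vpclmulqdq instead of
        ## four, via Karatsuba: hi = a1*b1, lo = a0*b0 and a middle term
        ## (a1^a0)*(b1^b0), from which the cross terms are recovered as
        ## mid ^ hi ^ lo.  A minimal C sketch with PCLMUL intrinsics
        ## (illustrative only; the names are ours, not this file's):
        ##
        ##   #include <wmmintrin.h>
        ##   static inline void clmul_karatsuba(__m128i a, __m128i b,
        ##                  __m128i *hi, __m128i *lo, __m128i *mid)
        ##   {
        ##          *hi = _mm_clmulepi64_si128(a, b, 0x11);   /* a1*b1 */
        ##          *lo = _mm_clmulepi64_si128(a, b, 0x00);   /* a0*b0 */
        ##          /* 0x4e swaps the 64-bit halves, like vpshufd $0b01001110 */
        ##          __m128i at = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
        ##          __m128i bt = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
        ##          /* (a1^a0)*(b1^b0); the caller xors in *hi and *lo */
        ##          *mid = _mm_clmulepi64_si128(at, bt, 0x00);
        ##   }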
        vmovdqa         HashKey_7(arg1), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey_6(arg1), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey_5(arg1), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey_4(arg1), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey_3(arg1), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey_2(arg1), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
        vmovdqa         HashKey(arg1), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6

.endm
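## (added note) GCM_ENC_DEC consumes the reduced accumulator left in T6: per
## the GCM construction it is GHASHed together with the AAD/ciphertext length
## block and then encrypted with the J0 pre-counter block to form the tag.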
#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         u8           *hash_subkey) /* H, the Hash sub key input.
#                                       Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp              # align rsp to 64 bytes
        vmovdqu (arg2), %xmm6           # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6     # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqa %xmm6, HashKey(arg1)    # store HashKey<<1 mod poly
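        ## (added note, not in the original source) the sequence above doubles
        ## the byte-swapped HashKey in GF(2^128): shift the 128-bit value left
        ## by one across both qwords, then, if the shifted-out bit 127 was
        ## set, xor in the field polynomial (the POLY constant).  A minimal C
        ## sketch on two 64-bit halves (illustrative names, not this file's):
        ##
        ##   #include <stdint.h>
        ##   static void gf128_double(uint64_t *hi, uint64_t *lo)
        ##   {
        ##          uint64_t carry = *hi >> 63;          /* bit 127 */
        ##          *hi = (*hi << 1) | (*lo >> 63);
        ##          *lo <<= 1;
        ##          if (carry) {                         /* reduce mod poly */
        ##                  *hi ^= 0xC200000000000000ULL;
        ##                  *lo ^= 1ULL;
        ##          }
        ##   }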
        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen4)

#endif /* CONFIG_AS_AVX2 */
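###############################################################################
## (added usage sketch, not from the original source)  A hedged C-side view
## of how the three entry points compose, assuming the gcm_data layout and
## expanded AES-128 round keys prepared by the accompanying glue code; the
## identifiers below are illustrative:
##
##   u8 tag[16], tag2[16];
##   aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);  /* H = AES_K(0^128) */
##   aesni_gcm_enc_avx_gen4(ctx, ct, pt, pt_len, iv,
##                          aad, aad_len, tag, 16);
##   aesni_gcm_dec_avx_gen4(ctx, pt2, ct, pt_len, iv,
##                          aad, aad_len, tag2, 16);
##   /* the caller must compare tag and tag2 in constant time */
###############################################################################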