1 /* SPDX-License-Identifier: GPL-2.0-only */
3 * SSE2 implementation of MORUS-1280
5 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
6 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
9 #include <linux/linkage.h>
10 #include <asm/frame.h>
12 #define SHUFFLE_MASK(i0, i1, i2, i3) \
13 (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
15 #define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
17 #define STATE0_LO %xmm0
18 #define STATE0_HI %xmm1
19 #define STATE1_LO %xmm2
20 #define STATE1_HI %xmm3
21 #define STATE2_LO %xmm4
22 #define STATE2_HI %xmm5
23 #define STATE3_LO %xmm6
24 #define STATE3_HI %xmm7
25 #define STATE4_LO %xmm8
26 #define STATE4_HI %xmm9
36 .section .rodata.cst16.morus640_const, "aM", @progbits, 16
39 .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
40 .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
42 .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
43 .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
45 .section .rodata.cst16.morus640_counter, "aM", @progbits, 16
48 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
49 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
51 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
52 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
58 * HI_1 | HI_0 || LO_1 | LO_0
60 * HI_0 | HI_1 || LO_1 | LO_0
62 * HI_0 | LO_1 || LO_0 | HI_1
64 pshufd $MASK2, \hi, \hi
80 * HI_1 | HI_0 || LO_1 | LO_0
82 * HI_0 | HI_1 || LO_1 | LO_0
84 * LO_0 | HI_1 || HI_0 | LO_1
86 pshufd $MASK2, \hi, \hi
93 .macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
/*
 * Arguments: an _l/_h register pair for each of s0..s4, followed by b and w.
 * b is used below as a shift count; w is not used in the lines visible here.
 * NOTE(review): only the two right shifts are visible in this view. They look
 * like one half of a rotate-left by b within each 64-bit lane; confirm against
 * the complementary left shift and xor, which are not shown.
 */
107 psrlq $(64 - \b), \s0_l /* logical right shift of each 64-bit lane of s0_l by (64 - b) */
112 psrlq $(64 - \b), \s0_h /* same shift applied to s0_h */
119 * __morus1280_update: internal ABI
121 * STATE[0-4] - input state
122 * MSG - message block
124 * STATE[0-4] - output state
130 STATE0_LO, STATE0_HI, \
131 STATE1_LO, STATE1_HI, \
132 STATE2_LO, STATE2_HI, \
133 STATE3_LO, STATE3_HI, \
134 STATE4_LO, STATE4_HI, \
136 pxor MSG_LO, STATE1_LO /* state[1] ^= MSG (LO half) */
137 pxor MSG_HI, STATE1_HI /* state[1] ^= MSG (HI half) */
139 STATE1_LO, STATE1_HI, \
140 STATE2_LO, STATE2_HI, \
141 STATE3_LO, STATE3_HI, \
142 STATE4_LO, STATE4_HI, \
143 STATE0_LO, STATE0_HI, \
145 pxor MSG_LO, STATE2_LO /* state[2] ^= MSG (LO half) */
146 pxor MSG_HI, STATE2_HI /* state[2] ^= MSG (HI half) */
148 STATE2_LO, STATE2_HI, \
149 STATE3_LO, STATE3_HI, \
150 STATE4_LO, STATE4_HI, \
151 STATE0_LO, STATE0_HI, \
152 STATE1_LO, STATE1_HI, \
154 pxor MSG_LO, STATE3_LO /* state[3] ^= MSG (LO half) */
155 pxor MSG_HI, STATE3_HI /* state[3] ^= MSG (HI half) */
157 STATE3_LO, STATE3_HI, \
158 STATE4_LO, STATE4_HI, \
159 STATE0_LO, STATE0_HI, \
160 STATE1_LO, STATE1_HI, \
161 STATE2_LO, STATE2_HI, \
163 pxor MSG_LO, STATE4_LO /* state[4] ^= MSG (LO half) */
164 pxor MSG_HI, STATE4_HI /* state[4] ^= MSG (HI half) */
166 STATE4_LO, STATE4_HI, \
167 STATE0_LO, STATE0_HI, \
168 STATE1_LO, STATE1_HI, \
169 STATE2_LO, STATE2_HI, \
170 STATE3_LO, STATE3_HI, \
173 ENDPROC(__morus1280_update)
176 * __morus1280_update_zero: internal ABI
178 * STATE[0-4] - input state
180 * STATE[0-4] - output state
184 __morus1280_update_zero:
/*
 * Five register argument lists follow, each starting one state word later
 * than the previous one (STATE0, STATE1, STATE2, STATE3, STATE4) and wrapping
 * around. No MSG register is referenced anywhere in this routine.
 * NOTE(review): the lines that consume these lists (presumably
 * morus1280_round invocations and their trailing arguments) are not visible
 * in this view.
 */
186 STATE0_LO, STATE0_HI, \
187 STATE1_LO, STATE1_HI, \
188 STATE2_LO, STATE2_HI, \
189 STATE3_LO, STATE3_HI, \
190 STATE4_LO, STATE4_HI, \
193 STATE1_LO, STATE1_HI, \
194 STATE2_LO, STATE2_HI, \
195 STATE3_LO, STATE3_HI, \
196 STATE4_LO, STATE4_HI, \
197 STATE0_LO, STATE0_HI, \
200 STATE2_LO, STATE2_HI, \
201 STATE3_LO, STATE3_HI, \
202 STATE4_LO, STATE4_HI, \
203 STATE0_LO, STATE0_HI, \
204 STATE1_LO, STATE1_HI, \
207 STATE3_LO, STATE3_HI, \
208 STATE4_LO, STATE4_HI, \
209 STATE0_LO, STATE0_HI, \
210 STATE1_LO, STATE1_HI, \
211 STATE2_LO, STATE2_HI, \
214 STATE4_LO, STATE4_HI, \
215 STATE0_LO, STATE0_HI, \
216 STATE1_LO, STATE1_HI, \
217 STATE2_LO, STATE2_HI, \
218 STATE3_LO, STATE3_HI, \
221 ENDPROC(__morus1280_update_zero)
224 * __load_partial: internal ABI
229 * MSG - message block
290 movdqa MSG_LO, MSG_HI
291 movdqu (%rsi), MSG_LO
295 ENDPROC(__load_partial)
298 * __store_partial: internal ABI
363 ENDPROC(__store_partial)
366 * void crypto_morus1280_sse2_init(void *state, const void *key,
369 ENTRY(crypto_morus1280_sse2_init)
/*
 * Builds the initial five-word state in registers, runs 16 zero-message
 * updates, xors the key into state[1] once more and writes the resulting
 * 160 bytes (10 x 16) to the buffer at (%rdi).
 */
/* state[0]: HI half cleared, LO half read from the 16 bytes at (%rdx) */
/* NOTE(review): %rdx is presumably the IV pointer; the third parameter is not visible in this view */
373 pxor STATE0_HI, STATE0_HI
374 movdqu (%rdx), STATE0_LO
/* state[1]: 32 bytes read from (%rsi), the key argument */
/* KEY_LO/KEY_HI keep a copy because they are xored into state[1] again after the updates */
376 movdqu 0(%rsi), KEY_LO
377 movdqu 16(%rsi), KEY_HI
378 movdqa KEY_LO, STATE1_LO
379 movdqa KEY_HI, STATE1_HI
/* state[2]: all ones (comparing a register with itself sets every bit) */
381 pcmpeqd STATE2_LO, STATE2_LO
382 pcmpeqd STATE2_HI, STATE2_HI
383 /* load all zeros: */
384 pxor STATE3_LO, STATE3_LO
385 pxor STATE3_HI, STATE3_HI
386 /* load the constant: */
387 movdqa .Lmorus640_const_0, STATE4_LO
388 movdqa .Lmorus640_const_1, STATE4_HI
390 /* update 16 times with zero: */
391 call __morus1280_update_zero
392 call __morus1280_update_zero
393 call __morus1280_update_zero
394 call __morus1280_update_zero
395 call __morus1280_update_zero
396 call __morus1280_update_zero
397 call __morus1280_update_zero
398 call __morus1280_update_zero
399 call __morus1280_update_zero
400 call __morus1280_update_zero
401 call __morus1280_update_zero
402 call __morus1280_update_zero
403 call __morus1280_update_zero
404 call __morus1280_update_zero
405 call __morus1280_update_zero
406 call __morus1280_update_zero
408 /* xor-in the key again after updates: */
409 pxor KEY_LO, STATE1_LO
410 pxor KEY_HI, STATE1_HI
412 /* store the state: */
413 movdqu STATE0_LO, (0 * 16)(%rdi)
414 movdqu STATE0_HI, (1 * 16)(%rdi)
415 movdqu STATE1_LO, (2 * 16)(%rdi)
416 movdqu STATE1_HI, (3 * 16)(%rdi)
417 movdqu STATE2_LO, (4 * 16)(%rdi)
418 movdqu STATE2_HI, (5 * 16)(%rdi)
419 movdqu STATE3_LO, (6 * 16)(%rdi)
420 movdqu STATE3_HI, (7 * 16)(%rdi)
421 movdqu STATE4_LO, (8 * 16)(%rdi)
422 movdqu STATE4_HI, (9 * 16)(%rdi)
426 ENDPROC(crypto_morus1280_sse2_init)
429 * void crypto_morus1280_sse2_ad(void *state, const void *data,
430 * unsigned int length);
432 ENTRY(crypto_morus1280_sse2_ad)
/*
 * Loads the 160-byte state from (%rdi), passes 32-byte blocks read from
 * (%rsi) to __morus1280_update as MSG, and writes the state back to (%rdi).
 * NOTE(review): the length check, the test that selects between the two load
 * variants below, and the loop control are not visible in this view.
 */
438 /* load the state: */
439 movdqu (0 * 16)(%rdi), STATE0_LO
440 movdqu (1 * 16)(%rdi), STATE0_HI
441 movdqu (2 * 16)(%rdi), STATE1_LO
442 movdqu (3 * 16)(%rdi), STATE1_HI
443 movdqu (4 * 16)(%rdi), STATE2_LO
444 movdqu (5 * 16)(%rdi), STATE2_HI
445 movdqu (6 * 16)(%rdi), STATE3_LO
446 movdqu (7 * 16)(%rdi), STATE3_HI
447 movdqu (8 * 16)(%rdi), STATE4_LO
448 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned variant: movdqa requires (%rsi) to be 16-byte aligned */
456 movdqa 0(%rsi), MSG_LO
457 movdqa 16(%rsi), MSG_HI
458 call __morus1280_update
/* unaligned variant of the same step: movdqu has no alignment requirement */
467 movdqu 0(%rsi), MSG_LO
468 movdqu 16(%rsi), MSG_HI
469 call __morus1280_update
476 /* store the state: */
477 movdqu STATE0_LO, (0 * 16)(%rdi)
478 movdqu STATE0_HI, (1 * 16)(%rdi)
479 movdqu STATE1_LO, (2 * 16)(%rdi)
480 movdqu STATE1_HI, (3 * 16)(%rdi)
481 movdqu STATE2_LO, (4 * 16)(%rdi)
482 movdqu STATE2_HI, (5 * 16)(%rdi)
483 movdqu STATE3_LO, (6 * 16)(%rdi)
484 movdqu STATE3_HI, (7 * 16)(%rdi)
485 movdqu STATE4_LO, (8 * 16)(%rdi)
486 movdqu STATE4_HI, (9 * 16)(%rdi)
491 ENDPROC(crypto_morus1280_sse2_ad)
494 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
495 * unsigned int length);
497 ENTRY(crypto_morus1280_sse2_enc)
/*
 * Loads the state from (%rdi), reads 32-byte blocks from (%rsi) into MSG,
 * writes the 32-byte T0 result to (%rdx), updates the state with MSG and
 * finally stores the state back to (%rdi).
 * NOTE(review): the instructions that initialise T0 and fold T1 into it, the
 * alignment test and the loop control are not visible in this view.
 */
503 /* load the state: */
504 movdqu (0 * 16)(%rdi), STATE0_LO
505 movdqu (1 * 16)(%rdi), STATE0_HI
506 movdqu (2 * 16)(%rdi), STATE1_LO
507 movdqu (3 * 16)(%rdi), STATE1_HI
508 movdqu (4 * 16)(%rdi), STATE2_LO
509 movdqu (5 * 16)(%rdi), STATE2_HI
510 movdqu (6 * 16)(%rdi), STATE3_LO
511 movdqu (7 * 16)(%rdi), STATE3_HI
512 movdqu (8 * 16)(%rdi), STATE4_LO
513 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned variant: source and destination are accessed with movdqa (16-byte alignment required) */
522 movdqa 0(%rsi), MSG_LO
523 movdqa 16(%rsi), MSG_HI
524 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
525 movdqa STATE1_HI, T1_HI
531 pxor STATE0_LO, T0_LO /* T0 ^= state[0] */
532 pxor STATE0_HI, T0_HI
533 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
534 movdqa STATE2_HI, T1_HI
535 pand STATE3_LO, T1_LO
536 pand STATE3_HI, T1_HI
539 movdqa T0_LO, 0(%rdx) /* write the 32-byte T0 result to the destination */
540 movdqa T0_HI, 16(%rdx)
542 call __morus1280_update /* MSG_LO/MSG_HI still hold the block loaded from (%rsi) */
/* unaligned variant: same sequence using movdqu for the memory accesses */
552 movdqu 0(%rsi), MSG_LO
553 movdqu 16(%rsi), MSG_HI
554 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
555 movdqa STATE1_HI, T1_HI
561 pxor STATE0_LO, T0_LO /* T0 ^= state[0] */
562 pxor STATE0_HI, T0_HI
563 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
564 movdqa STATE2_HI, T1_HI
565 pand STATE3_LO, T1_LO
566 pand STATE3_HI, T1_HI
569 movdqu T0_LO, 0(%rdx) /* write the 32-byte T0 result to the destination */
570 movdqu T0_HI, 16(%rdx)
572 call __morus1280_update
580 /* store the state: */
581 movdqu STATE0_LO, (0 * 16)(%rdi)
582 movdqu STATE0_HI, (1 * 16)(%rdi)
583 movdqu STATE1_LO, (2 * 16)(%rdi)
584 movdqu STATE1_HI, (3 * 16)(%rdi)
585 movdqu STATE2_LO, (4 * 16)(%rdi)
586 movdqu STATE2_HI, (5 * 16)(%rdi)
587 movdqu STATE3_LO, (6 * 16)(%rdi)
588 movdqu STATE3_HI, (7 * 16)(%rdi)
589 movdqu STATE4_LO, (8 * 16)(%rdi)
590 movdqu STATE4_HI, (9 * 16)(%rdi)
595 ENDPROC(crypto_morus1280_sse2_enc)
598 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
599 * unsigned int length);
601 ENTRY(crypto_morus1280_sse2_enc_tail)
/*
 * Loads the state from (%rdi), performs one block step that ends in a single
 * __morus1280_update call, and stores the state back to (%rdi).
 * NOTE(review): how MSG is filled and how T0 is written out are not visible
 * in this view (presumably via __load_partial / __store_partial); confirm.
 */
604 /* load the state: */
605 movdqu (0 * 16)(%rdi), STATE0_LO
606 movdqu (1 * 16)(%rdi), STATE0_HI
607 movdqu (2 * 16)(%rdi), STATE1_LO
608 movdqu (3 * 16)(%rdi), STATE1_HI
609 movdqu (4 * 16)(%rdi), STATE2_LO
610 movdqu (5 * 16)(%rdi), STATE2_HI
611 movdqu (6 * 16)(%rdi), STATE3_LO
612 movdqu (7 * 16)(%rdi), STATE3_HI
613 movdqu (8 * 16)(%rdi), STATE4_LO
614 movdqu (9 * 16)(%rdi), STATE4_HI
616 /* encrypt message: */
619 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
620 movdqa STATE1_HI, T1_HI
626 pxor STATE0_LO, T0_LO /* T0 ^= state[0] */
627 pxor STATE0_HI, T0_HI
628 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
629 movdqa STATE2_HI, T1_HI
630 pand STATE3_LO, T1_LO
631 pand STATE3_HI, T1_HI
637 call __morus1280_update
639 /* store the state: */
640 movdqu STATE0_LO, (0 * 16)(%rdi)
641 movdqu STATE0_HI, (1 * 16)(%rdi)
642 movdqu STATE1_LO, (2 * 16)(%rdi)
643 movdqu STATE1_HI, (3 * 16)(%rdi)
644 movdqu STATE2_LO, (4 * 16)(%rdi)
645 movdqu STATE2_HI, (5 * 16)(%rdi)
646 movdqu STATE3_LO, (6 * 16)(%rdi)
647 movdqu STATE3_HI, (7 * 16)(%rdi)
648 movdqu STATE4_LO, (8 * 16)(%rdi)
649 movdqu STATE4_HI, (9 * 16)(%rdi)
653 ENDPROC(crypto_morus1280_sse2_enc_tail)
656 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
657 * unsigned int length);
659 ENTRY(crypto_morus1280_sse2_dec)
/*
 * Loads the state from (%rdi), reads 32-byte blocks from (%rsi) into MSG,
 * transforms MSG in place, writes it to (%rdx), then updates the state with
 * the transformed MSG and finally stores the state back to (%rdi).
 * NOTE(review): the instructions that fold T1 into MSG, the alignment test
 * and the loop control are not visible in this view.
 */
665 /* load the state: */
666 movdqu (0 * 16)(%rdi), STATE0_LO
667 movdqu (1 * 16)(%rdi), STATE0_HI
668 movdqu (2 * 16)(%rdi), STATE1_LO
669 movdqu (3 * 16)(%rdi), STATE1_HI
670 movdqu (4 * 16)(%rdi), STATE2_LO
671 movdqu (5 * 16)(%rdi), STATE2_HI
672 movdqu (6 * 16)(%rdi), STATE3_LO
673 movdqu (7 * 16)(%rdi), STATE3_HI
674 movdqu (8 * 16)(%rdi), STATE4_LO
675 movdqu (9 * 16)(%rdi), STATE4_HI
/* aligned variant: source and destination are accessed with movdqa (16-byte alignment required) */
684 movdqa 0(%rsi), MSG_LO
685 movdqa 16(%rsi), MSG_HI
686 pxor STATE0_LO, MSG_LO /* MSG ^= state[0] */
687 pxor STATE0_HI, MSG_HI
688 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
689 movdqa STATE1_HI, T1_HI
693 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
694 movdqa STATE2_HI, T1_HI
695 pand STATE3_LO, T1_LO
696 pand STATE3_HI, T1_HI
699 movdqa MSG_LO, 0(%rdx) /* write the transformed MSG to the destination */
700 movdqa MSG_HI, 16(%rdx)
702 call __morus1280_update /* the state update consumes the same MSG that was just written */
/* unaligned variant: same sequence using movdqu for the memory accesses */
712 movdqu 0(%rsi), MSG_LO
713 movdqu 16(%rsi), MSG_HI
714 pxor STATE0_LO, MSG_LO /* MSG ^= state[0] */
715 pxor STATE0_HI, MSG_HI
716 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
717 movdqa STATE1_HI, T1_HI
721 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
722 movdqa STATE2_HI, T1_HI
723 pand STATE3_LO, T1_LO
724 pand STATE3_HI, T1_HI
727 movdqu MSG_LO, 0(%rdx) /* write the transformed MSG to the destination */
728 movdqu MSG_HI, 16(%rdx)
730 call __morus1280_update
738 /* store the state: */
739 movdqu STATE0_LO, (0 * 16)(%rdi)
740 movdqu STATE0_HI, (1 * 16)(%rdi)
741 movdqu STATE1_LO, (2 * 16)(%rdi)
742 movdqu STATE1_HI, (3 * 16)(%rdi)
743 movdqu STATE2_LO, (4 * 16)(%rdi)
744 movdqu STATE2_HI, (5 * 16)(%rdi)
745 movdqu STATE3_LO, (6 * 16)(%rdi)
746 movdqu STATE3_HI, (7 * 16)(%rdi)
747 movdqu STATE4_LO, (8 * 16)(%rdi)
748 movdqu STATE4_HI, (9 * 16)(%rdi)
753 ENDPROC(crypto_morus1280_sse2_dec)
756 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
757 * unsigned int length);
759 ENTRY(crypto_morus1280_sse2_dec_tail)
/*
 * Loads the state from (%rdi), transforms MSG, prepares a per-byte mask,
 * performs one __morus1280_update and stores the state back to (%rdi).
 * NOTE(review): the partial load/store of the block, the initial value moved
 * into T0_LO and the final compare/mask instructions are not visible in this
 * view.
 */
762 /* load the state: */
763 movdqu (0 * 16)(%rdi), STATE0_LO
764 movdqu (1 * 16)(%rdi), STATE0_HI
765 movdqu (2 * 16)(%rdi), STATE1_LO
766 movdqu (3 * 16)(%rdi), STATE1_HI
767 movdqu (4 * 16)(%rdi), STATE2_LO
768 movdqu (5 * 16)(%rdi), STATE2_HI
769 movdqu (6 * 16)(%rdi), STATE3_LO
770 movdqu (7 * 16)(%rdi), STATE3_HI
771 movdqu (8 * 16)(%rdi), STATE4_LO
772 movdqu (9 * 16)(%rdi), STATE4_HI
774 /* decrypt message: */
777 pxor STATE0_LO, MSG_LO /* MSG ^= state[0] */
778 pxor STATE0_HI, MSG_HI
779 movdqa STATE1_LO, T1_LO /* T1 = copy of state[1] */
780 movdqa STATE1_HI, T1_HI
784 movdqa STATE2_LO, T1_LO /* T1 = state[2] & state[3] */
785 movdqa STATE2_HI, T1_HI
786 pand STATE3_LO, T1_LO
787 pand STATE3_HI, T1_HI
795 /* mask with byte count: */
/* four self-interleaves replicate the lowest byte of T0_LO into all 16 byte lanes */
797 punpcklbw T0_LO, T0_LO
798 punpcklbw T0_LO, T0_LO
799 punpcklbw T0_LO, T0_LO
800 punpcklbw T0_LO, T0_LO
/* NOTE(review): presumably the 0x00..0x1f byte-index table from the morus640_counter section; label definitions are not visible here */
802 movdqa .Lmorus640_counter_0, T1_LO
803 movdqa .Lmorus640_counter_1, T1_HI
809 call __morus1280_update
811 /* store the state: */
812 movdqu STATE0_LO, (0 * 16)(%rdi)
813 movdqu STATE0_HI, (1 * 16)(%rdi)
814 movdqu STATE1_LO, (2 * 16)(%rdi)
815 movdqu STATE1_HI, (3 * 16)(%rdi)
816 movdqu STATE2_LO, (4 * 16)(%rdi)
817 movdqu STATE2_HI, (5 * 16)(%rdi)
818 movdqu STATE3_LO, (6 * 16)(%rdi)
819 movdqu STATE3_HI, (7 * 16)(%rdi)
820 movdqu STATE4_LO, (8 * 16)(%rdi)
821 movdqu STATE4_HI, (9 * 16)(%rdi)
825 ENDPROC(crypto_morus1280_sse2_dec_tail)
828 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
829 * u64 assoclen, u64 cryptlen);
831 ENTRY(crypto_morus1280_sse2_final)
/*
 * Loads the state from (%rdi), xors state[0] into state[4], runs ten
 * __morus1280_update calls, then xors a 32-byte value derived from the state
 * into the buffer at (%rsi) (the tag_xor argument) in place.
 * The state is not written back to (%rdi) in the lines visible here.
 * NOTE(review): the instructions that build the length block in MSG and that
 * fold T0 into MSG are not visible in this view.
 */
834 /* load the state: */
835 movdqu (0 * 16)(%rdi), STATE0_LO
836 movdqu (1 * 16)(%rdi), STATE0_HI
837 movdqu (2 * 16)(%rdi), STATE1_LO
838 movdqu (3 * 16)(%rdi), STATE1_HI
839 movdqu (4 * 16)(%rdi), STATE2_LO
840 movdqu (5 * 16)(%rdi), STATE2_HI
841 movdqu (6 * 16)(%rdi), STATE3_LO
842 movdqu (7 * 16)(%rdi), STATE3_HI
843 movdqu (8 * 16)(%rdi), STATE4_LO
844 movdqu (9 * 16)(%rdi), STATE4_HI
846 /* xor state[0] into state[4]: */
847 pxor STATE0_LO, STATE4_LO
848 pxor STATE0_HI, STATE4_HI
850 /* prepare length block: */
855 psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
/* ten consecutive state updates */
859 call __morus1280_update
860 call __morus1280_update
861 call __morus1280_update
862 call __morus1280_update
863 call __morus1280_update
864 call __morus1280_update
865 call __morus1280_update
866 call __morus1280_update
867 call __morus1280_update
868 call __morus1280_update
/* MSG is reused from here on to hold the 32 bytes currently stored at (%rsi) */
871 movdqu 0(%rsi), MSG_LO
872 movdqu 16(%rsi), MSG_HI
874 pxor STATE0_LO, MSG_LO /* MSG ^= state[0] */
875 pxor STATE0_HI, MSG_HI
876 movdqa STATE1_LO, T0_LO /* T0 = copy of state[1] */
877 movdqa STATE1_HI, T0_HI
881 movdqa STATE2_LO, T0_LO /* T0 = state[2] & state[3] */
882 movdqa STATE2_HI, T0_HI
883 pand STATE3_LO, T0_LO
884 pand STATE3_HI, T0_HI
888 movdqu MSG_LO, 0(%rsi) /* write the result back over the 32 bytes at (%rsi) */
889 movdqu MSG_HI, 16(%rsi)
893 ENDPROC(crypto_morus1280_sse2_final)