# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
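
# The build invokes this script with the output file as its argument
# (e.g. "perl sha256-armv4.pl sha256-core.S"); all assembly accumulated
# in $code below is post-processed and printed to it at the end.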
@V=($A,$B,$C,$D,$E,$F,$G,$H);
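
# Reference round (FIPS 180-4), which the scheduled code below computes
# with the rotate amounts kept in @Sigma0/@Sigma1:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	(a,b,c,d,e,f,g,h) = (T1+T2,a,b,c,d+T1,e,f,g)
#
# where Sigma1(x) = ROR(x,6)^ROR(x,11)^ROR(x,25),
#       Sigma0(x) = ROR(x,2)^ROR(x,13)^ROR(x,22),
#       Ch(e,f,g) = (e&f)^(~e&g) and Maj(a,b,c) = (a&b)^(a&c)^(b&c).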

my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
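	@ Maj(a,b,c) is computed as ((a^b)&(b^c))^b: a^b from this round
	@ becomes next round's b^c, and the final h+=Maj(a,b,c) is deferred
	@ into the next round's "h+=Maj(a,b,c) from the past" add, which
	@ relaxes the result latency on dual-issue pipelines.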

my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
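	@ message schedule (FIPS 180-4):
	@   X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
	@ with sigma0(x) = ROR(x,7)^ROR(x,18)^(x>>3) and
	@      sigma1(x) = ROR(x,17)^ROR(x,19)^(x>>10); indices are taken
	@ mod 16 because the schedule lives in a 16-word circular buffer
	@ on the stack.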
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
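@ The 64 words above are the first 32 bits of the fractional parts of
@ the cube roots of the first 64 primes (FIPS 180-4); the trailing
@ zero word is the terminator the NEON path scans for.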

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
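	@ ($len is a count of 64-byte blocks, hence the lsl#6)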
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
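
# "magic": $t3 is pre-seeded with b^c so each round can form Maj(a,b,c)
# as ((a^b)&(b^c))^b, and $t2 is zeroed because round 0 has no deferred
# "h+=Maj(a,b,c) from the past" to add.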
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
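
# Only 16 rounds of BODY_16_XX are emitted; .Lrounds_16_xx is executed
# three times to cover rounds 16..63, and the "cmp $t2,#0xf2" test on
# the low byte of the last K256 word (0xc67178f2) detects completion.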
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2		@ done yet?
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order

######################################################################

my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";   }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
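
# Dlo()/Dhi() map a NEON quad register "qN" onto the double registers
# "d(2N)"/"d(2N+1)" that alias its low and high halves.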

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
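
# e.g. &vext_8($T0,@X[0],@X[1],4) emits "vext.8 q8,q0,q1,#4":
# underscores become dots and a numeric final argument gets a "#".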

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	&vshr_u32	($T2,$T0,$sigma0[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	&vshr_u32	($T1,$T0,$sigma0[2]);
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	&vshr_u32	($T3,$T0,$sigma0[1]);
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
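	# NEON has no vector rotate, so each ROR(x,n) is synthesized as
	# VSHR.U32 (x>>n) followed by VSLI.32 inserting x<<(32-n).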
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vrev32_8	(@X[0],@X[0]);
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]") if ($j==15);'.
	'&ldr	($t1,"[sp,#64]") if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
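
# Each string above is eval'ed once per round by Xupdate/Xpreload,
# which interleave these scalar-ALU rounds with the NEON schedule
# update so the integer and NEON pipes stay busy simultaneously.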

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str	$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$inp,[sp,#68]
	mov	$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str	$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
	teq	$t1,#0				@ check for K256 terminator
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
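
# The final 16 rounds go through Xpreload, which byte-swaps the next
# block's input (loaded above) and queues its first K+W values instead
# of computing further message-schedule updates.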
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0		@ accumulate
	ldreq	sp,[sp,#76]		@ restore original sp
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon

######################################################################

my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.global	sha256_block_data_order_armv8
.type	sha256_block_data_order_armv8,%function
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
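
# $W0/$W1 double-buffer the K+W vectors and @MSG rotates, so twelve
# iterations of this pattern cover rounds 0..47; rounds 48..63 follow
# below without sha256su0/su1 since no more schedule is needed.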
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it	ne
	bne	.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret				@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if ($_=~/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
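
# (the loop above copies this script's leading "#" comments into the
# generated file as "@" assembler comments, so the license notice
# travels with the output)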
690 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
691 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				     |(($2&7)<<17)|(($2&8)<<4)
				     |(($3&7)<<1) |(($3&8)<<2);
	# ARMv7 instructions are always encoded little-endian, so emit the
	# four bytes in that order; the correct solution would be the .inst
	# directive, but older assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
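
# e.g. "sha256h q0,q1,q12" encodes to the word 0xf3020c68 and is emitted
# as "INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12".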

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush