#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#		SHA256-hw	SHA256(*)	SHA512
# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
# Denver	2.01		10.5 (+26%)	6.70 (+8%)
# X-Gene			20.0 (+100%)	12.8 (+300%(***))
# Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
# Kryo		1.92		17.4 (+30%)	11.2 (+8%)
# ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
#
# (*)	Software SHA256 results are of lesser relevance, presented
#	mostly for informational purposes.
# (**)	The result is a trade-off: it's possible to improve it by
#	10% (or by 1 cycle per round), but at the cost of 20% loss
#	on Cortex-A53 (or by 4 cycles per round).
# (***)	Super-impressive coefficients over gcc-generated code are
#	indication of some compiler "pathology", most notably code
#	generated with -mgeneral-regs-only is significantly faster
#	and the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it makes no sense to implement NEON
# version of SHA256 for 64-bit processors. This is because performance
# improvement on most wide-spread Cortex-A5x processors was observed
# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
# observed that 32-bit NEON SHA256 performs significantly better than
# 64-bit scalar version on *some* of the more recent processors. As
# result 64-bit NEON version of SHA256 was added to provide best
# all-round performance. For example it executes ~30% faster on X-Gene
# and Mongoose. [For reference, NEON version of SHA512 is bound to
# deliver much less improvement, likely *negative* on Cortex-A5x.
# Which is why NEON support is limited to SHA256.]

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
     59 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
     60 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
     61 
     62 if ($flavour && $flavour ne "void") {
     63     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     64     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
     65     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
     66     die "can't locate arm-xlate.pl";
     67 
     68     open OUT,"| \"$^X\" $xlate $flavour \"$output\""
     69         or die "can't call $xlate: $!";
     70     *STDOUT=*OUT;
     71 } else {
     72     $output and open STDOUT,">$output";
     73 }
     74 
     75 if ($output =~ /512/) {
     76 	$BITS=512;
     77 	$SZ=8;
     78 	@Sigma0=(28,34,39);
     79 	@Sigma1=(14,18,41);
     80 	@sigma0=(1,  8, 7);
     81 	@sigma1=(19,61, 6);
     82 	$rounds=80;
     83 	$reg_t="x";
     84 } else {
     85 	$BITS=256;
     86 	$SZ=4;
     87 	@Sigma0=( 2,13,22);
     88 	@Sigma1=( 6,11,25);
     89 	@sigma0=( 7,18, 3);
     90 	@sigma1=(17,19,10);
     91 	$rounds=64;
     92 	$reg_t="w";
     93 }
     94 
     95 $func="sha${BITS}_block_data_order";
     96 
     97 ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
     98 
     99 @X=map("$reg_t$_",(3..15,0..2));
    100 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
    101 ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
    102 
    103 sub BODY_00_xx {
    104 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    105 my $j=($i+1)&15;
    106 my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
    107    $T0=@X[$i+3] if ($i<11);
    108 
    109 $code.=<<___	if ($i<16);
    110 #ifndef	__AARCH64EB__
    111 	rev	@X[$i],@X[$i]			// $i
    112 #endif
    113 ___
    114 $code.=<<___	if ($i<13 && ($i&1));
    115 	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
    116 ___
    117 $code.=<<___	if ($i==13);
    118 	ldp	@X[14],@X[15],[$inp]
    119 ___
    120 $code.=<<___	if ($i>=14);
    121 	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
    122 ___
    123 $code.=<<___	if ($i>0 && $i<16);
    124 	add	$a,$a,$t1			// h+=Sigma0(a)
    125 ___
    126 $code.=<<___	if ($i>=11);
    127 	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
    128 ___
    129 # While ARMv8 specifies merged rotate-n-logical operation such as
    130 # 'eor x,y,z,ror#n', it was found to negatively affect performance
    131 # on Apple A7. The reason seems to be that it requires even 'y' to
    132 # be available earlier. This means that such merged instruction is
    133 # not necessarily best choice on critical path... On the other hand
    134 # Cortex-A5x handles merged instructions much better than disjoint
    135 # rotate and logical... See (**) footnote above.
    136 $code.=<<___	if ($i<15);
    137 	ror	$t0,$e,#$Sigma1[0]
    138 	add	$h,$h,$t2			// h+=K[i]
    139 	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
    140 	and	$t1,$f,$e
    141 	bic	$t2,$g,$e
    142 	add	$h,$h,@X[$i&15]			// h+=X[i]
    143 	orr	$t1,$t1,$t2			// Ch(e,f,g)
    144 	eor	$t2,$a,$b			// a^b, b^c in next round
    145 	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
    146 	ror	$T0,$a,#$Sigma0[0]
    147 	add	$h,$h,$t1			// h+=Ch(e,f,g)
    148 	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
    149 	add	$h,$h,$t0			// h+=Sigma1(e)
    150 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
    151 	add	$d,$d,$h			// d+=h
    152 	eor	$t3,$t3,$b			// Maj(a,b,c)
    153 	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
    154 	add	$h,$h,$t3			// h+=Maj(a,b,c)
    155 	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
    156 	//add	$h,$h,$t1			// h+=Sigma0(a)
    157 ___
    158 $code.=<<___	if ($i>=15);
    159 	ror	$t0,$e,#$Sigma1[0]
    160 	add	$h,$h,$t2			// h+=K[i]
    161 	ror	$T1,@X[($j+1)&15],#$sigma0[0]
    162 	and	$t1,$f,$e
    163 	ror	$T2,@X[($j+14)&15],#$sigma1[0]
    164 	bic	$t2,$g,$e
    165 	ror	$T0,$a,#$Sigma0[0]
    166 	add	$h,$h,@X[$i&15]			// h+=X[i]
    167 	eor	$t0,$t0,$e,ror#$Sigma1[1]
    168 	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
    169 	orr	$t1,$t1,$t2			// Ch(e,f,g)
    170 	eor	$t2,$a,$b			// a^b, b^c in next round
    171 	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
    172 	eor	$T0,$T0,$a,ror#$Sigma0[1]
    173 	add	$h,$h,$t1			// h+=Ch(e,f,g)
    174 	and	$t3,$t3,$t2			// (b^c)&=(a^b)
    175 	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
    176 	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
    177 	add	$h,$h,$t0			// h+=Sigma1(e)
    178 	eor	$t3,$t3,$b			// Maj(a,b,c)
    179 	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
    180 	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
    181 	add	@X[$j],@X[$j],@X[($j+9)&15]
    182 	add	$d,$d,$h			// d+=h
    183 	add	$h,$h,$t3			// h+=Maj(a,b,c)
    184 	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
    185 	add	@X[$j],@X[$j],$T1
    186 	add	$h,$h,$t1			// h+=Sigma0(a)
    187 	add	@X[$j],@X[$j],$T2
    188 ___
    189 	($t2,$t3)=($t3,$t2);
    190 }
    191 
    192 $code.=<<___;
    193 #ifndef	__KERNEL__
    194 # include "arm_arch.h"
    195 .extern	OPENSSL_armcap_P
    196 .hidden	OPENSSL_armcap_P
    197 #endif
    198 
    199 .text
    200 
    201 .globl	$func
    202 .type	$func,%function
    203 .align	6
    204 $func:
    205 #ifndef	__KERNEL__
    206 	adrp	x16,OPENSSL_armcap_P
    207 	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
    208 ___
    209 $code.=<<___	if ($SZ==4);
    210 	tst	w16,#ARMV8_SHA256
    211 	b.ne	.Lv8_entry
    212 	tst	w16,#ARMV7_NEON
    213 	b.ne	.Lneon_entry
    214 ___
    215 $code.=<<___	if ($SZ==8);
    216 	tst	w16,#ARMV8_SHA512
    217 	b.ne	.Lv8_entry
    218 ___
    219 $code.=<<___;
    220 #endif
    221 	.inst	0xd503233f				// paciasp
    222 	stp	x29,x30,[sp,#-128]!
    223 	add	x29,sp,#0
    224 
    225 	stp	x19,x20,[sp,#16]
    226 	stp	x21,x22,[sp,#32]
    227 	stp	x23,x24,[sp,#48]
    228 	stp	x25,x26,[sp,#64]
    229 	stp	x27,x28,[sp,#80]
    230 	sub	sp,sp,#4*$SZ
    231 
    232 	ldp	$A,$B,[$ctx]				// load context
    233 	ldp	$C,$D,[$ctx,#2*$SZ]
    234 	ldp	$E,$F,[$ctx,#4*$SZ]
    235 	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
    236 	ldp	$G,$H,[$ctx,#6*$SZ]
    237 	adr	$Ktbl,.LK$BITS
    238 	stp	$ctx,$num,[x29,#96]
    239 
    240 .Loop:
    241 	ldp	@X[0],@X[1],[$inp],#2*$SZ
    242 	ldr	$t2,[$Ktbl],#$SZ			// *K++
    243 	eor	$t3,$B,$C				// magic seed
    244 	str	$inp,[x29,#112]
    245 ___
    246 for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
    247 $code.=".Loop_16_xx:\n";
    248 for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
    249 $code.=<<___;
    250 	cbnz	$t2,.Loop_16_xx
    251 
    252 	ldp	$ctx,$num,[x29,#96]
    253 	ldr	$inp,[x29,#112]
    254 	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
    255 
    256 	ldp	@X[0],@X[1],[$ctx]
    257 	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
    258 	add	$inp,$inp,#14*$SZ			// advance input pointer
    259 	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
    260 	add	$A,$A,@X[0]
    261 	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
    262 	add	$B,$B,@X[1]
    263 	add	$C,$C,@X[2]
    264 	add	$D,$D,@X[3]
    265 	stp	$A,$B,[$ctx]
    266 	add	$E,$E,@X[4]
    267 	add	$F,$F,@X[5]
    268 	stp	$C,$D,[$ctx,#2*$SZ]
    269 	add	$G,$G,@X[6]
    270 	add	$H,$H,@X[7]
    271 	cmp	$inp,$num
    272 	stp	$E,$F,[$ctx,#4*$SZ]
    273 	stp	$G,$H,[$ctx,#6*$SZ]
    274 	b.ne	.Loop
    275 
    276 	ldp	x19,x20,[x29,#16]
    277 	add	sp,sp,#4*$SZ
    278 	ldp	x21,x22,[x29,#32]
    279 	ldp	x23,x24,[x29,#48]
    280 	ldp	x25,x26,[x29,#64]
    281 	ldp	x27,x28,[x29,#80]
    282 	ldp	x29,x30,[sp],#128
    283 	.inst	0xd50323bf				// autiasp
    284 	ret
    285 .size	$func,.-$func
    286 
    287 .align	6
    288 .type	.LK$BITS,%object
    289 .LK$BITS:
    290 ___
    291 $code.=<<___ if ($SZ==8);
    292 	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
    293 	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
    294 	.quad	0x3956c25bf348b538,0x59f111f1b605d019
    295 	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
    296 	.quad	0xd807aa98a3030242,0x12835b0145706fbe
    297 	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
    298 	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
    299 	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
    300 	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
    301 	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
    302 	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
    303 	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
    304 	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
    305 	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
    306 	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
    307 	.quad	0x06ca6351e003826f,0x142929670a0e6e70
    308 	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
    309 	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
    310 	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
    311 	.quad	0x81c2c92e47edaee6,0x92722c851482353b
    312 	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
    313 	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
    314 	.quad	0xd192e819d6ef5218,0xd69906245565a910
    315 	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
    316 	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
    317 	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
    318 	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
    319 	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
    320 	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
    321 	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
    322 	.quad	0x90befffa23631e28,0xa4506cebde82bde9
    323 	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
    324 	.quad	0xca273eceea26619c,0xd186b8c721c0c207
    325 	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
    326 	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
    327 	.quad	0x113f9804bef90dae,0x1b710b35131c471b
    328 	.quad	0x28db77f523047d84,0x32caab7b40c72493
    329 	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
    330 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
    331 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
    332 	.quad	0	// terminator
    333 ___
    334 $code.=<<___ if ($SZ==4);
    335 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    336 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    337 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    338 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    339 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    340 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    341 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    342 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    343 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    344 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    345 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    346 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    347 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    348 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    349 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    350 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    351 	.long	0	//terminator
    352 ___
    353 $code.=<<___;
    354 .size	.LK$BITS,.-.LK$BITS
    355 .asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
    356 .align	2
    357 ___
    358 
    359 if ($SZ==4) {
    360 my $Ktbl="x3";
    361 
    362 my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
    363 my @MSG=map("v$_.16b",(4..7));
    364 my ($W0,$W1)=("v16.4s","v17.4s");
    365 my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
    366 
    367 $code.=<<___;
    368 #ifndef	__KERNEL__
    369 .type	sha256_block_armv8,%function
    370 .align	6
    371 sha256_block_armv8:
    372 .Lv8_entry:
    373 	stp		x29,x30,[sp,#-16]!
    374 	add		x29,sp,#0
    375 
    376 	ld1.32		{$ABCD,$EFGH},[$ctx]
    377 	adr		$Ktbl,.LK256
    378 
    379 .Loop_hw:
    380 	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
    381 	sub		$num,$num,#1
    382 	ld1.32		{$W0},[$Ktbl],#16
    383 	rev32		@MSG[0],@MSG[0]
    384 	rev32		@MSG[1],@MSG[1]
    385 	rev32		@MSG[2],@MSG[2]
    386 	rev32		@MSG[3],@MSG[3]
    387 	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
    388 	orr		$EFGH_SAVE,$EFGH,$EFGH
    389 ___
    390 for($i=0;$i<12;$i++) {
    391 $code.=<<___;
    392 	ld1.32		{$W1},[$Ktbl],#16
    393 	add.i32		$W0,$W0,@MSG[0]
    394 	sha256su0	@MSG[0],@MSG[1]
    395 	orr		$abcd,$ABCD,$ABCD
    396 	sha256h		$ABCD,$EFGH,$W0
    397 	sha256h2	$EFGH,$abcd,$W0
    398 	sha256su1	@MSG[0],@MSG[2],@MSG[3]
    399 ___
    400 	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
    401 }
    402 $code.=<<___;
    403 	ld1.32		{$W1},[$Ktbl],#16
    404 	add.i32		$W0,$W0,@MSG[0]
    405 	orr		$abcd,$ABCD,$ABCD
    406 	sha256h		$ABCD,$EFGH,$W0
    407 	sha256h2	$EFGH,$abcd,$W0
    408 
    409 	ld1.32		{$W0},[$Ktbl],#16
    410 	add.i32		$W1,$W1,@MSG[1]
    411 	orr		$abcd,$ABCD,$ABCD
    412 	sha256h		$ABCD,$EFGH,$W1
    413 	sha256h2	$EFGH,$abcd,$W1
    414 
    415 	ld1.32		{$W1},[$Ktbl]
    416 	add.i32		$W0,$W0,@MSG[2]
    417 	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
    418 	orr		$abcd,$ABCD,$ABCD
    419 	sha256h		$ABCD,$EFGH,$W0
    420 	sha256h2	$EFGH,$abcd,$W0
    421 
    422 	add.i32		$W1,$W1,@MSG[3]
    423 	orr		$abcd,$ABCD,$ABCD
    424 	sha256h		$ABCD,$EFGH,$W1
    425 	sha256h2	$EFGH,$abcd,$W1
    426 
    427 	add.i32		$ABCD,$ABCD,$ABCD_SAVE
    428 	add.i32		$EFGH,$EFGH,$EFGH_SAVE
    429 
    430 	cbnz		$num,.Loop_hw
    431 
    432 	st1.32		{$ABCD,$EFGH},[$ctx]
    433 
    434 	ldr		x29,[sp],#16
    435 	ret
    436 .size	sha256_block_armv8,.-sha256_block_armv8
    437 #endif
    438 ___
    439 }
    440 
    441 if ($SZ==4) {	######################################### NEON stuff #
    442 # You'll surely note a lot of similarities with sha256-armv4 module,
    443 # and of course it's not a coincidence. sha256-armv4 was used as
    444 # initial template, but was adapted for ARMv8 instruction set and
    445 # extensively re-tuned for all-round performance.
    446 
    447 my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
    448 my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
    449 my $Ktbl="x16";
    450 my $Xfer="x17";
    451 my @X = map("q$_",(0..3));
    452 my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
    453 my $j=0;
    454 
    455 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
    456 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
    457   my $arg = pop;
    458     $arg = "#$arg" if ($arg*1 eq $arg);
    459     $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
    460 }
    461 
    462 sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
    463 sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
    464 sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
    465 
    466 sub Xupdate()
    467 { use integer;
    468   my $body = shift;
    469   my @insns = (&$body,&$body,&$body,&$body);
    470   my ($a,$b,$c,$d,$e,$f,$g,$h);
    471 
    472 	&ext_8		($T0,@X[0],@X[1],4);	# X[1..4]
    473 	 eval(shift(@insns));
    474 	 eval(shift(@insns));
    475 	 eval(shift(@insns));
    476 	&ext_8		($T3,@X[2],@X[3],4);	# X[9..12]
    477 	 eval(shift(@insns));
    478 	 eval(shift(@insns));
    479 	&mov		(&Dscalar($T7),&Dhi(@X[3]));	# X[14..15]
    480 	 eval(shift(@insns));
    481 	 eval(shift(@insns));
    482 	&ushr_32	($T2,$T0,$sigma0[0]);
    483 	 eval(shift(@insns));
    484 	&ushr_32	($T1,$T0,$sigma0[2]);
    485 	 eval(shift(@insns));
    486 	&add_32 	(@X[0],@X[0],$T3);	# X[0..3] += X[9..12]
    487 	 eval(shift(@insns));
    488 	&sli_32		($T2,$T0,32-$sigma0[0]);
    489 	 eval(shift(@insns));
    490 	 eval(shift(@insns));
    491 	&ushr_32	($T3,$T0,$sigma0[1]);
    492 	 eval(shift(@insns));
    493 	 eval(shift(@insns));
    494 	&eor_8		($T1,$T1,$T2);
    495 	 eval(shift(@insns));
    496 	 eval(shift(@insns));
    497 	&sli_32		($T3,$T0,32-$sigma0[1]);
    498 	 eval(shift(@insns));
    499 	 eval(shift(@insns));
    500 	  &ushr_32	($T4,$T7,$sigma1[0]);
    501 	 eval(shift(@insns));
    502 	 eval(shift(@insns));
    503 	&eor_8		($T1,$T1,$T3);		# sigma0(X[1..4])
    504 	 eval(shift(@insns));
    505 	 eval(shift(@insns));
    506 	  &sli_32	($T4,$T7,32-$sigma1[0]);
    507 	 eval(shift(@insns));
    508 	 eval(shift(@insns));
    509 	  &ushr_32	($T5,$T7,$sigma1[2]);
    510 	 eval(shift(@insns));
    511 	 eval(shift(@insns));
    512 	  &ushr_32	($T3,$T7,$sigma1[1]);
    513 	 eval(shift(@insns));
    514 	 eval(shift(@insns));
    515 	&add_32		(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
    516 	 eval(shift(@insns));
    517 	 eval(shift(@insns));
    518 	  &sli_u32	($T3,$T7,32-$sigma1[1]);
    519 	 eval(shift(@insns));
    520 	 eval(shift(@insns));
    521 	  &eor_8	($T5,$T5,$T4);
    522 	 eval(shift(@insns));
    523 	 eval(shift(@insns));
    524 	 eval(shift(@insns));
    525 	  &eor_8	($T5,$T5,$T3);		# sigma1(X[14..15])
    526 	 eval(shift(@insns));
    527 	 eval(shift(@insns));
    528 	 eval(shift(@insns));
    529 	&add_32		(@X[0],@X[0],$T5);	# X[0..1] += sigma1(X[14..15])
    530 	 eval(shift(@insns));
    531 	 eval(shift(@insns));
    532 	 eval(shift(@insns));
    533 	  &ushr_32	($T6,@X[0],$sigma1[0]);
    534 	 eval(shift(@insns));
    535 	  &ushr_32	($T7,@X[0],$sigma1[2]);
    536 	 eval(shift(@insns));
    537 	 eval(shift(@insns));
    538 	  &sli_32	($T6,@X[0],32-$sigma1[0]);
    539 	 eval(shift(@insns));
    540 	  &ushr_32	($T5,@X[0],$sigma1[1]);
    541 	 eval(shift(@insns));
    542 	 eval(shift(@insns));
    543 	  &eor_8	($T7,$T7,$T6);
    544 	 eval(shift(@insns));
    545 	 eval(shift(@insns));
    546 	  &sli_32	($T5,@X[0],32-$sigma1[1]);
    547 	 eval(shift(@insns));
    548 	 eval(shift(@insns));
    549 	&ld1_32		("{$T0}","[$Ktbl], #16");
    550 	 eval(shift(@insns));
    551 	  &eor_8	($T7,$T7,$T5);		# sigma1(X[16..17])
    552 	 eval(shift(@insns));
    553 	 eval(shift(@insns));
    554 	&eor_8		($T5,$T5,$T5);
    555 	 eval(shift(@insns));
    556 	 eval(shift(@insns));
    557 	&mov		(&Dhi($T5), &Dlo($T7));
    558 	 eval(shift(@insns));
    559 	 eval(shift(@insns));
    560 	 eval(shift(@insns));
    561 	&add_32		(@X[0],@X[0],$T5);	# X[2..3] += sigma1(X[16..17])
    562 	 eval(shift(@insns));
    563 	 eval(shift(@insns));
    564 	 eval(shift(@insns));
    565 	&add_32		($T0,$T0,@X[0]);
    566 	 while($#insns>=1) { eval(shift(@insns)); }
    567 	&st1_32		("{$T0}","[$Xfer], #16");
    568 	 eval(shift(@insns));
    569 
    570 	push(@X,shift(@X));		# "rotate" X[]
    571 }
    572 
    573 sub Xpreload()
    574 { use integer;
    575   my $body = shift;
    576   my @insns = (&$body,&$body,&$body,&$body);
    577   my ($a,$b,$c,$d,$e,$f,$g,$h);
    578 
    579 	 eval(shift(@insns));
    580 	 eval(shift(@insns));
    581 	&ld1_8		("{@X[0]}","[$inp],#16");
    582 	 eval(shift(@insns));
    583 	 eval(shift(@insns));
    584 	&ld1_32		("{$T0}","[$Ktbl],#16");
    585 	 eval(shift(@insns));
    586 	 eval(shift(@insns));
    587 	 eval(shift(@insns));
    588 	 eval(shift(@insns));
    589 	&rev32		(@X[0],@X[0]);
    590 	 eval(shift(@insns));
    591 	 eval(shift(@insns));
    592 	 eval(shift(@insns));
    593 	 eval(shift(@insns));
    594 	&add_32		($T0,$T0,@X[0]);
    595 	 foreach (@insns) { eval; }	# remaining instructions
    596 	&st1_32		("{$T0}","[$Xfer], #16");
    597 
    598 	push(@X,shift(@X));		# "rotate" X[]
    599 }
    600 
    601 sub body_00_15 () {
    602 	(
    603 	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
    604 	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
    605 	'&add	($a,$a,$t4);'.			# h+=Sigma0(a) from the past
    606 	'&and	($t1,$f,$e)',
    607 	'&bic	($t4,$g,$e)',
    608 	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
    609 	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
    610 	'&orr	($t1,$t1,$t4)',			# Ch(e,f,g)
    611 	'&eor	($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
    612 	'&eor	($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
    613 	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
    614 	'&ror	($t0,$t0,"#$Sigma1[0]")',
    615 	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
    616 	'&eor	($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
    617 	'&add	($h,$h,$t0)',			# h+=Sigma1(e)
    618 	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
    619 	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
    620 	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
    621 	'&ror	($t4,$t4,"#$Sigma0[0]")',
    622 	'&add	($d,$d,$h)',			# d+=h
    623 	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
    624 	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
    625 	)
    626 }
    627 
    628 $code.=<<___;
    629 #ifdef	__KERNEL__
    630 .globl	sha256_block_neon
    631 #endif
    632 .type	sha256_block_neon,%function
    633 .align	4
    634 sha256_block_neon:
    635 .Lneon_entry:
    636 	stp	x29, x30, [sp, #-16]!
    637 	mov	x29, sp
    638 	sub	sp,sp,#16*4
    639 
    640 	adr	$Ktbl,.LK256
    641 	add	$num,$inp,$num,lsl#6	// len to point at the end of inp
    642 
    643 	ld1.8	{@X[0]},[$inp], #16
    644 	ld1.8	{@X[1]},[$inp], #16
    645 	ld1.8	{@X[2]},[$inp], #16
    646 	ld1.8	{@X[3]},[$inp], #16
    647 	ld1.32	{$T0},[$Ktbl], #16
    648 	ld1.32	{$T1},[$Ktbl], #16
    649 	ld1.32	{$T2},[$Ktbl], #16
    650 	ld1.32	{$T3},[$Ktbl], #16
    651 	rev32	@X[0],@X[0]		// yes, even on
    652 	rev32	@X[1],@X[1]		// big-endian
    653 	rev32	@X[2],@X[2]
    654 	rev32	@X[3],@X[3]
    655 	mov	$Xfer,sp
    656 	add.32	$T0,$T0,@X[0]
    657 	add.32	$T1,$T1,@X[1]
    658 	add.32	$T2,$T2,@X[2]
    659 	st1.32	{$T0-$T1},[$Xfer], #32
    660 	add.32	$T3,$T3,@X[3]
    661 	st1.32	{$T2-$T3},[$Xfer]
    662 	sub	$Xfer,$Xfer,#32
    663 
    664 	ldp	$A,$B,[$ctx]
    665 	ldp	$C,$D,[$ctx,#8]
    666 	ldp	$E,$F,[$ctx,#16]
    667 	ldp	$G,$H,[$ctx,#24]
    668 	ldr	$t1,[sp,#0]
    669 	mov	$t2,wzr
    670 	eor	$t3,$B,$C
    671 	mov	$t4,wzr
    672 	b	.L_00_48
    673 
    674 .align	4
    675 .L_00_48:
    676 ___
    677 	&Xupdate(\&body_00_15);
    678 	&Xupdate(\&body_00_15);
    679 	&Xupdate(\&body_00_15);
    680 	&Xupdate(\&body_00_15);
    681 $code.=<<___;
    682 	cmp	$t1,#0				// check for K256 terminator
    683 	ldr	$t1,[sp,#0]
    684 	sub	$Xfer,$Xfer,#64
    685 	bne	.L_00_48
    686 
    687 	sub	$Ktbl,$Ktbl,#256		// rewind $Ktbl
    688 	cmp	$inp,$num
    689 	mov	$Xfer, #64
    690 	csel	$Xfer, $Xfer, xzr, eq
    691 	sub	$inp,$inp,$Xfer			// avoid SEGV
    692 	mov	$Xfer,sp
    693 ___
    694 	&Xpreload(\&body_00_15);
    695 	&Xpreload(\&body_00_15);
    696 	&Xpreload(\&body_00_15);
    697 	&Xpreload(\&body_00_15);
    698 $code.=<<___;
    699 	add	$A,$A,$t4			// h+=Sigma0(a) from the past
    700 	ldp	$t0,$t1,[$ctx,#0]
    701 	add	$A,$A,$t2			// h+=Maj(a,b,c) from the past
    702 	ldp	$t2,$t3,[$ctx,#8]
    703 	add	$A,$A,$t0			// accumulate
    704 	add	$B,$B,$t1
    705 	ldp	$t0,$t1,[$ctx,#16]
    706 	add	$C,$C,$t2
    707 	add	$D,$D,$t3
    708 	ldp	$t2,$t3,[$ctx,#24]
    709 	add	$E,$E,$t0
    710 	add	$F,$F,$t1
    711 	 ldr	$t1,[sp,#0]
    712 	stp	$A,$B,[$ctx,#0]
    713 	add	$G,$G,$t2
    714 	 mov	$t2,wzr
    715 	stp	$C,$D,[$ctx,#8]
    716 	add	$H,$H,$t3
    717 	stp	$E,$F,[$ctx,#16]
    718 	 eor	$t3,$B,$C
    719 	stp	$G,$H,[$ctx,#24]
    720 	 mov	$t4,wzr
    721 	 mov	$Xfer,sp
    722 	b.ne	.L_00_48
    723 
    724 	ldr	x29,[x29]
    725 	add	sp,sp,#16*4+16
    726 	ret
    727 .size	sha256_block_neon,.-sha256_block_neon
    728 ___
    729 }
    730 
    731 if ($SZ==8) {
    732 my $Ktbl="x3";
    733 
    734 my @H = map("v$_.16b",(0..4));
    735 my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
    736 my @MSG=map("v$_.16b",(16..23));
    737 my ($W0,$W1)=("v24.2d","v25.2d");
    738 my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
    739 
    740 $code.=<<___;
    741 #ifndef	__KERNEL__
    742 .type	sha512_block_armv8,%function
    743 .align	6
    744 sha512_block_armv8:
    745 .Lv8_entry:
    746 	stp		x29,x30,[sp,#-16]!
    747 	add		x29,sp,#0
    748 
    749 	ld1		{@MSG[0]-@MSG[3]},[$inp],#64	// load input
    750 	ld1		{@MSG[4]-@MSG[7]},[$inp],#64
    751 
    752 	ld1.64		{@H[0]-@H[3]},[$ctx]		// load context
    753 	adr		$Ktbl,.LK512
    754 
    755 	rev64		@MSG[0],@MSG[0]
    756 	rev64		@MSG[1],@MSG[1]
    757 	rev64		@MSG[2],@MSG[2]
    758 	rev64		@MSG[3],@MSG[3]
    759 	rev64		@MSG[4],@MSG[4]
    760 	rev64		@MSG[5],@MSG[5]
    761 	rev64		@MSG[6],@MSG[6]
    762 	rev64		@MSG[7],@MSG[7]
    763 	b		.Loop_hw
    764 
    765 .align	4
    766 .Loop_hw:
    767 	ld1.64		{$W0},[$Ktbl],#16
    768 	subs		$num,$num,#1
    769 	sub		x4,$inp,#128
    770 	orr		$AB,@H[0],@H[0]			// offload
    771 	orr		$CD,@H[1],@H[1]
    772 	orr		$EF,@H[2],@H[2]
    773 	orr		$GH,@H[3],@H[3]
    774 	csel		$inp,$inp,x4,ne			// conditional rewind
    775 ___
    776 for($i=0;$i<32;$i++) {
    777 $code.=<<___;
    778 	add.i64		$W0,$W0,@MSG[0]
    779 	ld1.64		{$W1},[$Ktbl],#16
    780 	ext		$W0,$W0,$W0,#8
    781 	ext		$fg,@H[2],@H[3],#8
    782 	ext		$de,@H[1],@H[2],#8
    783 	add.i64		@H[3],@H[3],$W0			// "T1 + H + K512[i]"
    784 	 sha512su0	@MSG[0],@MSG[1]
    785 	 ext		$m9_10,@MSG[4],@MSG[5],#8
    786 	sha512h		@H[3],$fg,$de
    787 	 sha512su1	@MSG[0],@MSG[7],$m9_10
    788 	add.i64		@H[4],@H[1],@H[3]		// "D + T1"
    789 	sha512h2	@H[3],$H[1],@H[0]
    790 ___
    791 	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
    792 	@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
    793 }
    794 for(;$i<40;$i++) {
    795 $code.=<<___	if ($i<39);
    796 	ld1.64		{$W1},[$Ktbl],#16
    797 ___
    798 $code.=<<___	if ($i==39);
    799 	sub		$Ktbl,$Ktbl,#$rounds*$SZ	// rewind
    800 ___
    801 $code.=<<___;
    802 	add.i64		$W0,$W0,@MSG[0]
    803 	 ld1		{@MSG[0]},[$inp],#16		// load next input
    804 	ext		$W0,$W0,$W0,#8
    805 	ext		$fg,@H[2],@H[3],#8
    806 	ext		$de,@H[1],@H[2],#8
    807 	add.i64		@H[3],@H[3],$W0			// "T1 + H + K512[i]"
    808 	sha512h		@H[3],$fg,$de
    809 	 rev64		@MSG[0],@MSG[0]
    810 	add.i64		@H[4],@H[1],@H[3]		// "D + T1"
    811 	sha512h2	@H[3],$H[1],@H[0]
    812 ___
    813 	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
    814 	@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
    815 }
    816 $code.=<<___;
    817 	add.i64		@H[0],@H[0],$AB			// accumulate
    818 	add.i64		@H[1],@H[1],$CD
    819 	add.i64		@H[2],@H[2],$EF
    820 	add.i64		@H[3],@H[3],$GH
    821 
    822 	cbnz		$num,.Loop_hw
    823 
    824 	st1.64		{@H[0]-@H[3]},[$ctx]		// store context
    825 
    826 	ldr		x29,[sp],#16
    827 	ret
    828 .size	sha512_block_armv8,.-sha512_block_armv8
    829 #endif
    830 ___
    831 }
    832 
    833 {   my  %opcode = (
    834 	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
    835 	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);
    836 
    837     sub unsha256 {
    838 	my ($mnemonic,$arg)=@_;
    839 
    840 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    841 	&&
    842 	sprintf ".inst\t0x%08x\t//%s %s",
    843 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
    844 			$mnemonic,$arg;
    845     }
    846 }
    847 
    848 {   my  %opcode = (
    849 	"sha512h"	=> 0xce608000,	"sha512h2"	=> 0xce608400,
    850 	"sha512su0"	=> 0xcec08000,	"sha512su1"	=> 0xce608800	);
    851 
    852     sub unsha512 {
    853 	my ($mnemonic,$arg)=@_;
    854 
    855 	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
    856 	&&
    857 	sprintf ".inst\t0x%08x\t//%s %s",
    858 			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
    859 			$mnemonic,$arg;
    860     }
    861 }
    862 
    863 open SELF,$0;
    864 while(<SELF>) {
    865         next if (/^#!/);
    866         last if (!s/^#/\/\// and !/^$/);
    867         print;
    868 }
    869 close SELF;
    870 
    871 foreach(split("\n",$code)) {
    872 
    873 	s/\`([^\`]*)\`/eval($1)/ge;
    874 
    875 	s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge	or
    876 	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
    877 
    878 	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
    879 
    880 	s/\.[ui]?8(\s)/$1/;
    881 	s/\.\w?64\b//		and s/\.16b/\.2d/g	or
    882 	s/\.\w?32\b//		and s/\.16b/\.4s/g;
    883 	m/\bext\b/		and s/\.2d/\.16b/g	or
    884 	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;
    885 
    886 	print $_,"\n";
    887 }
    888 
    889 close STDOUT or die "error closing STDOUT: $!";
    890