# (stray code-browser navigation header removed)
      1 #! /usr/bin/env perl
      2 # Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
      3 #
      4 # Licensed under the Apache License 2.0 (the "License").  You may not use
      5 # this file except in compliance with the License.  You can obtain a copy
      6 # in the file LICENSE in the source distribution or at
      7 # https://www.openssl.org/source/license.html
      8 
      9 
     10 # ====================================================================
     11 # Written by Andy Polyakov <appro (at] openssl.org> for the OpenSSL
     12 # project. The module is, however, dual licensed under OpenSSL and
     13 # CRYPTOGAMS licenses depending on where you obtain it. For further
     14 # details see http://www.openssl.org/~appro/cryptogams/.
     15 #
     16 # Permission to use under GPL terms is granted.
     17 # ====================================================================
     18 
     19 # SHA512 block procedure for ARMv4. September 2007.
     20 
     21 # This code is ~4.5 (four and a half) times faster than code generated
     22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
     23 # Xscale PXA250 core].
     24 #
     25 # July 2010.
     26 #
     27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
     28 # Cortex A8 core and ~40 cycles per processed byte.
     29 
     30 # February 2011.
     31 #
     32 # Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
     34 
     35 # March 2011.
     36 #
     37 # Add NEON implementation. On Cortex A8 it was measured to process
     38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
     39 
     40 # August 2012.
     41 #
     42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
     43 # terms it's 22.6 cycles per byte, which is disappointing result.
     44 # Technical writers asserted that 3-way S4 pipeline can sustain
     45 # multiple NEON instructions per cycle, but dual NEON issue could
     46 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
     47 # for further details. On side note Cortex-A15 processes one byte in
     48 # 16 cycles.
     49 
     50 # Byte order [in]dependence. =========================================
     51 #
     52 # Originally caller was expected to maintain specific *dword* order in
     53 # h[0-7], namely with most significant dword at *lower* address, which
     54 # was reflected in below two parameters as 0 and 4. Now caller is
     55 # expected to maintain native byte order for whole 64-bit values.
# HI/LO are emitted symbolically and resolved by the C preprocessor
# (see the #ifdef __ARMEL__ block below), keeping the generated
# assembly independent of target byte order.
($hi,$lo)=("HI","LO");
     58 # ====================================================================
     59 
     60 # $output is the last argument if it looks like a file (it has an extension)
     61 # $flavour is the first argument if it doesn't look like a file
# Parse the perlasm driver arguments.  By OpenSSL convention the output
# file (if any) is the last argument and must carry an extension, while
# $flavour (e.g. "linux32", "ios32", "win32") is the first argument and
# must not look like a filename.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or under ../../perlasm and
    # pipe our output through it, so flavour-specific syntax gets rewritten.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translator requested: write generated assembly straight to $output.
    $output and open STDOUT,">$output";
}
     76 
# Scalar (integer-only) register allocation.  r13 is the stack pointer
# and r15 the program counter, so neither is assigned here.  These are
# package globals because the code-generating heredocs interpolate them.
($ctx,$inp,$len)=("r0","r1","r2");	# parameter block, input ptr, length

($Tlo,$Thi)=("r3","r4");		# T, the round accumulator
($Alo,$Ahi)=("r5","r6");		# working variable a
($Elo,$Ehi)=("r7","r8");		# working variable e
($t0,$t1,$t2,$t3)=("r9","r10","r11","r12");	# scratch
$Ktbl="r14";				# K512 pointer lives in lr

# Byte offsets of the a..h state dwords (and the X[] scratch area that
# follows them) within the stack frame.
($Aoff,$Boff,$Coff,$Doff,$Eoff,$Foff,$Goff,$Hoff,$Xoff)=map(8*$_,(0..8));
    104 
# Emit one SHA-512 round for the 32-bit integer-only path, appending the
# assembly text to $code.  Each 64-bit quantity is carried in a lo/hi
# 32-bit register pair (ARMv4 has no 64-bit registers); a and e live in
# registers, the rest of the state sits in the stack frame.
#
# $magic is the low byte of K[i].lo at the loop boundary (0x94 == K[15],
# 0x17 == K[79], see the K512 table below): the generated code compares
# it against the K constant just loaded and sets bit 0 of $Ktbl when it
# matches, signalling loop exit without spending a register on a counter.
#
# NOTE: the former empty prototype "()" was removed — this sub takes an
# argument and was only callable because the &BODY_00_15(...) call sites
# bypass prototype checking.
sub BODY_00_15 {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef	__thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
    201 
# armasm (win builds) spells the data directive DCDU; GNU as uses .word.
my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");

# Preamble, K512 constant table and the integer-only
# sha512_block_data_order entry point.  The LO/HI/WORD64 macros let the
# same source assemble correctly for either byte order.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	hi0,lo0, hi1,lo1
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.text

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
# endif
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	# 0x94 is the low byte of K[15].lo (0xcf692694): the round body
	# flags loop exit when it reaches that constant.
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	# 0x17 is the low byte of K[79].lo (0x4a475817) — last round.
	&BODY_00_15(0x17);
$code.=<<___;
#ifdef	__thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___
    515 
{
# SHA-512 rotation/shift constants from FIPS 180-4, used by the NEON
# generators below (Sigma = big-sigma on e/a, sigma = message schedule).
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

# NEON register map: d0-d15 carry the 16-entry message schedule X[],
# d16-d23 carry the working variables a..h (rotated via @V each round).
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
    527 
# Emit one NEON SHA-512 round, appending assembly to $code.
# Arguments: round index $i, then the eight working-variable registers
# a..h for this round (the caller rotates them between rounds).
# For $i<16 the message word is loaded (and byte-swapped on
# little-endian); the "h+=Maj from the past" add completes the previous
# round's Maj contribution, deferred for scheduling.
#
# NOTE: the former empty prototype "()" was removed — this sub takes
# arguments and was only callable because call sites use &NEON_00_15.
sub NEON_00_15 {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
    575 
# Emit one NEON round for rounds 16..79.  Odd rounds delegate straight
# to NEON_00_15; even rounds first extend the message schedule two
# entries at a time (the d-register X[] pairs viewed as q registers),
# interleaving the start of NEON_00_15's Sigma1 shifts for scheduling,
# then emit the round proper via NEON_00_15.
#
# NOTE: the former empty prototype "()" was removed (the sub takes
# arguments; only the & call sites made it work), and the one-element
# slice @_[4] was tightened to the scalar element $_[4] — same value.
sub NEON_16_79 {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=$_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
    614 
# NEON entry point.  Rounds 0-15 are fully unrolled; rounds 16-79 run as
# a 16-round loop executed 4 times ($cnt).
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
# Unroll rounds 0..15, rotating the working-variable list each round.
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
# Sixteen more rounds per loop iteration (looped 4 times = rounds 16..79).
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
# Trailing identification string and the OPENSSL_armcap_P declarations
# needed by the runtime CPU-capability probe above.
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
#endif
___
    667 
# Post-process the generated assembly:
#  - evaluate `...` snippets as Perl (constant folding of offsets),
#  - encode existing "bx lr" as a raw opcode so the output assembles
#    with -march=armv4, then rewrite "ret" to "bx lr" (order matters:
#    doing it the other way round would encode the rets too).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

# Reproduce this script's leading comment block (license/credits) at the
# top of the output, converting '#' comments to the assembler's '@'.
# Three-arg open with a checked lexical handle (was: bareword 2-arg open).
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close $self;

print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
    682