#! /usr/bin/env perl
# Copyright 2007-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================

# NOTE: everything above the first code line ($hi=...) is copied into the
# generated assembly by the header-copy loop at the bottom of this script.
# Comments from here on are not emitted.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of $0 (with trailing separator), or "" when the script
    # was invoked without any path component.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything we print through the flavour-specific translator.
    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to the output file, if any.
    # Fail loudly instead of silently falling back to the inherited stdout.
    if ($output) {
        open STDOUT, '>', $output
            or die "can't open $output: $!";
    }
}

# Register allocation for the integer-only code path.
$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

# Byte offsets of the working variables a..h and of the message schedule
# X[] inside the stack frame.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

# BODY_00_15($magic)
#
# Appends one integer-only SHA-512 round to $code.  On entry the generated
# code expects the message word for this round in $Thi:$Tlo and the current
# a and e in $Ahi:$Alo / $Ehi:$Elo; the remaining working variables are read
# from the stack frame.  Each round lowers sp by 8, so the fixed offsets
# $Aoff..$Hoff address the rotated variables in the next round.
#
# $magic is compared with the low byte of K[i].lo; on a match bit 0 of $Ktbl
# is set, which the caller's generated code tests to leave the round loop
# (0x94 is the low byte of K[15], 0x17 of K[79]).
sub BODY_00_15 {
    my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef	__thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}

# Data directive used by the WORD64 macro; "DCDU" for flavours matching /win/.
my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	hi0,lo0, hi1,lo1
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
# define adrl adr
#else
.code	32
#endif

.text

.type	K512,%object
.align	5
K512:
	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
# endif
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef	__ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
    # Rounds 0..15: the round body is emitted once and looped over in the
    # generated code until K[15] is recognised.
    BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
    # Rounds 16..79: same round body, looped until K[79] is recognised.
    BODY_00_15(0x17);
$code.=<<___;
#ifdef	__thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

# NEON code path.  The lexicals below shadow the integer-path globals
# ($Ktbl in particular) for the generators defined inside this scope.
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

# NEON_00_15($i, $a..$h)
#
# Appends NEON round $i to $code; $a..$h are the d-registers currently
# playing the roles of the working variables.  For $i<16 the round also
# loads (and, on little-endian, byte-swaps) the message word.  The final
# "h += Maj" is left in $Maj (d30) and folded in by the following round.
# For even $i>=16 the leading Sigma1 shifts are skipped here because
# NEON_16_79 has already emitted them.
sub NEON_00_15 {
    my $i=shift;
    my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
    my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

# NEON_16_79($i, $a..$h)
#
# Appends NEON round $i (16 and up).  Odd rounds are plain NEON_00_15
# rounds.  Even rounds first update two message-schedule words at once
# using q-register views of X[], interleaved with the Sigma1 shifts that
# NEON_00_15 would otherwise emit, and then append the round itself.
sub NEON_16_79 {
    my $i=shift;

    if ($i&1)	{ NEON_00_15($i,@_); return; }

    # 2x-vectorized, therefore runs every 2nd round
    my @X=map("q$_",(0..7));			# view @X as 128-bit vector
    my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
    my ($d0,$d1,$d2) = map("d$_",(24..26));	# temps from NEON_00_15
    my $e=$_[4];				# $e from NEON_00_15
    $i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
    NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
# Rounds 0..15 are fully unrolled; @V is rotated after every round so that
# each register takes on the next working-variable role.
for($i=0;$i<16;$i++)	{ NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
# Rounds 16..79: 16 rounds are unrolled and the generated loop runs them
# four times ($cnt).
for(;$i<32;$i++)	{ NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
#endif
___

# Evaluate the `...` arithmetic placeholders left in the generated text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;

# Copy this script's leading comment block (licence and history) into the
# output as assembler comments: skip the shebang, turn a leading '#' into
# '@', and stop at the first line that is neither a comment nor blank.
# The header is cosmetic, so failing to reopen the script is not fatal.
if (open my $self, '<', $0) {
    while (<$self>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
    }
    close $self;
} else {
    warn "can't open $0 to copy the header comment: $!";
}

print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush