// Poly1305 MAC for AArch64 (OpenSSL poly1305-armv8 flavor).
// Scalar path keeps the 130-bit accumulator h in base 2^64 (x4,x5,x6 = h0,h1,h2);
// the NEON path keeps it in base 2^26 (five 26-bit limbs) for 2-way vectorization.
// Syntax: GNU as, // comments. ABI: AAPCS64.
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

// int poly1305_init(void *ctx=x0, const u8 *key=x1, void *func[2]=x2)
//
// Zeroes the hash state at [x0..x0+31] (including the is_base2_26 flag word).
// If x1 is NULL, returns 0. Otherwise loads 16 bytes of key material from
// [x1], clamps it per Poly1305 (r &= 0ffffffc0ffffffc0ffffffc0fffffff),
// stores it at [x0,#32], writes the blocks/emit function-pointer pair to
// [x2] (NEON variants if OPENSSL_armcap_P has ARMV7_NEON set), returns 1.
.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq		// return value 0 if key==NULL
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff	// clamp mask for r
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON		// select scalar vs NEON entry points

	adrp	x12,poly1305_blocks
	add	x12,x12,#:lo12:.Lpoly1305_blocks
	adrp	x7,poly1305_blocks_neon
	add	x7,x7,#:lo12:.Lpoly1305_blocks_neon
	adrp	x13,poly1305_emit
	add	x13,x13,#:lo12:.Lpoly1305_emit
	adrp	x8,poly1305_emit_neon
	add	x8,x8,#:lo12:.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq		// eq => no NEON => scalar pointers
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]		// 32-bit function pointers
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1			// success
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

// void poly1305_blocks(void *ctx=x0, const u8 *inp=x1, size_t len=x2, u32 padbit=x3)
//
// Scalar path. Processes len (rounded down to a multiple of 16) bytes of
// input; padbit (x3) is added as bit 128 of each block (h2 += padbit).
// Hash h = x4:x5:x6, key r0=x7, r1=x8, s1=x9=r1+(r1>>2).
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	// h *= r mod 2^130-5, interleaved with the carry chain above
	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3		// h2 += padbit (carried in x3)
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2	// fold bits >=130 back times 5
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

// void poly1305_emit(void *ctx=x0, u8 mac[16]=x1, const u32 nonce[4]=x2)
//
// Scalar path. Fully reduces h mod 2^130-5, adds the 128-bit nonce and
// writes the 16-byte tag to [x1].
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// pick h or h-p, branch-free
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

// Internal helper: h = h*r mod 2^130-5.
// In/out: h = x4:x5:x6. In: r0=x7, r1=x8, s1=x9 = r1+(r1>>2).
// Clobbers x10-x14. Same multiply as the .Loop body of poly1305_blocks.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

// Internal helper: split r (base 2^64 in x4:x5:x6) into five 26-bit limbs
// and store them, together with the *5 ("s") multiples, into the r^n table
// at [x0] with a 16-byte stride per slot. Clobbers x12-x16.
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

// void poly1305_blocks_neon(void *ctx=x0, const u8 *inp=x1, size_t len=x2, u32 padbit=x3)
//
// NEON path: processes 4 blocks per iteration in base 2^26 with lazy
// reduction. Falls through to the scalar .Lpoly1305_blocks for short
// inputs (<128 bytes) while the state is still base 2^64. On first NEON
// use it builds the r^1..r^4 power table at [x0,#48] and sets the
// is_base2_26 flag at [x0,#24].
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]		// is_base2_26 flag
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks	// short input, still base 2^64: use scalar

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!	// frame + room for d8-d15 at sp+16..79
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31			// odd number of 16-byte blocks?
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult		// absorb the odd block in scalar form
	ldr	x30,[sp,#8]		// restore lr clobbered by bl

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10			// hash limbs -> v24-v28
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]		// restore lr after the bl calls

	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// read zeros instead of inp[2:3] at tail

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// padbit positioned for the 2^26 top limb
	add	x15,x0,#48		// r^n table

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64	// r^n table -> v0-v8
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// v31 = 26-bit limb mask 0x03ffffff

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	// ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	// ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	// NEON multiplies are interleaved with scalar base-2^26 splitting
	// of the next input blocks to hide latency.
	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2	// h4 carry * 5 = carry + (carry<<2)
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80		// pop frame (x30 already reloaded where clobbered)
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

// void poly1305_emit_neon(void *ctx=x0, u8 mac[16]=x1, const u32 nonce[4]=x2)
//
// NEON-state emit: if the hash is still base 2^64 (is_base2_26 clear),
// tail-calls scalar poly1305_emit; otherwise converts the 2^26 limbs back
// to base 2^64, reduces, adds the nonce and writes the 16-byte tag.
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.section	.rodata

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0		// padding source for short tails
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2