// ChaCha20 for AArch64.  GNU assembler syntax; the file is run through the
// C preprocessor first (see the #include / #ifdef lines below).
//
// Register roles shared by every entry point in this part of the file, as
// established by the loads and stores below:
//   x0 = output pointer          x1 = input pointer
//   x2 = length in bytes         x3 = pointer to 32 bytes of key
//   x4 = pointer to 16 bytes of counter/nonce
#include "arm_arch.h"
#ifndef	__KERNEL__

.hidden	OPENSSL_armcap_P


#endif

.section	.rodata

.align	5
// The two quads are the ASCII bytes "expand 32-byte k" read little-endian.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane block offsets added to the counter word by the NEON code.
.Lone:
.long	1,2,3,4
// TBL byte indices: within each 32-bit lane, output byte i takes input
// byte (i+3) mod 4.  Used by the NEON code where the scalar code does
// "ror #24".
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm", NUL-terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

.text

//----------------------------------------------------------------------
// ChaCha20_ctr32_dflt
//
// Scalar/NEON dispatcher and scalar implementation.
//   In:  x0 = out, x1 = in, x2 = len, x3 = key, x4 = counter
//   len == 0   : returns immediately (guard added; see below)
//   len <  192 : scalar code at .Lshort
//   len >= 192 : NEON code at .LChaCha20_neon when OPENSSL_armcap_P has
//                ARMV7_NEON set (only in non-__KERNEL__ builds), scalar
//                code otherwise
//
// Scalar register use:
//   x22,x23        sigma, two 32-bit words per register
//   x24..x27       key words
//   x28,x30        counter/nonce words (x28 is advanced per block)
//   w5..w17,w19..w21  the 16 working state words (x18 is not used)
//   x4             round-pair counter inside .Loop (the counter pointer
//                  is dead once x28/x30 are loaded)
//
// Stack: 96-byte frame holding x29/x30 and x19..x28, plus 64 bytes of
// scratch below it that the tail path uses for one keystream block and
// wipes before returning.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	// Zero-length guard.  This symbol is exported, and without the guard
	// len == 0 reaches .Loop_tail with an index of 0 that is incremented
	// before the cbnz test, so the loop would not stop until the index
	// wraps.  Same check, same exit label as ChaCha20_ctr32 uses.
	cbz	x2,.Labort
	cmp	x2,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:				// ChaCha20_ctr32 enters here with w17 loaded
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// scratch for one keystream block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__AARCH64EB__
	// Big-endian: swap the 32-bit halves so the lower-addressed word
	// sits in the low half, as the unpack code below expects.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations of the round pair below
	// The flags set here are consumed by "b.lo .Ltail" and
	// "b.hi .Loop_outer" further down; nothing in between sets flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// first half: words (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21)
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// second half: words (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20)
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32		// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// more input remains (flags from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:				// shared early-out: no frame has been pushed
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes left (1..63)
.Less_than_64:				// also entered from the NEON tail code
	// Bias the pointers so one negative index, counting up to zero,
	// walks input, keystream and output.  x0 carries an extra -1 because
	// the store below uses the index after it has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32		// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]		// keystream block to stack scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	stp	xzr,xzr,[sp,#0]		// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Top-level entry.  Same register arguments as ChaCha20_ctr32_dflt.
//   len == 0                 : return at .Labort
//   len <  192               : scalar code at .Lshort
//   ARMV8_SVE capability set : ChaCha20_ctr32_sve first, then
//                              ChaCha20_ctr32_dflt if x2 is still non-zero
//                              (non-__KERNEL__ builds only)
//   otherwise                : .Lcheck_neon in ChaCha20_ctr32_dflt
// In __KERNEL__ builds everything goes to .Lshort.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2,.Labort
	cmp	x2,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// The SVE routine advances the counter, and the NEON/scalar code that
	// handles whatever is left must continue from that advanced value.
	// The caller's counter buffer is read-only per the API contract, so a
	// 16-byte copy is placed on the stack and x4 is pointed at the copy,
	// giving the SVE routine something it may write.
	ldp	x5,x6,[x4]
	stp	x5,x6,[sp]
	mov	x4,sp
	bl	ChaCha20_ctr32_sve
	// NOTE(review): x2 is presumably the byte count the SVE routine left
	// unprocessed - confirm against ChaCha20_ctr32_sve.
	cbz	x2,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon
//
// Exported only in __KERNEL__ builds.  .LChaCha20_neon is the internal
// entry used by ChaCha20_ctr32_dflt, which has already executed
// AARCH64_SIGN_LINK_REGISTER.  The routine continues below.
//----------------------------------------------------------------------
#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
// ChaCha20_neon, continued from the frame push at .LChaCha20_neon.
//
// Each outer iteration produces 320 bytes: one 64-byte block in the
// general-purpose registers and four blocks in NEON registers, with the
// two instruction streams interleaved.
//
// NEON layout inside the round loop (lane i belongs to NEON block i):
//   v16,v20,v24,v28   sigma words 0..3, broadcast
//   v17,v21,v25,v29   first four key words, broadcast
//   v18,v22,v26,v30   last four key words, broadcast
//   v19,v23,v27,v31   counter/nonce words 0..3, broadcast; v19 also
//                     carries the per-lane block offset from v8
//   v0..v3            unmodified input state, added back after the rounds
//   v4..v7            temporaries
//   v8                per-lane block offsets (.Lone, advanced by 5 per pass)
//   v9                .Lrot24 byte-index table for TBL
// Scalar register roles are the same as in ChaCha20_ctr32_dflt.
//
// Lengths of 512 bytes or more are handed to .L512_or_more_neon.
	mov	x29,sp			// same encoding as "add x29,sp,#0"

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64		// d8/d9 save slot, later reused as tail scratch

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16	// x5 now points at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	stp	d8,d9,[sp]		// meet ABI requirements
	ld1	{v8.4s,v9.4s},[x5]	// v8 = .Lone, v9 = .Lrot24
#ifdef	__AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer_neon:
	dup	v16.4s,v0.s[0]		// unpack key block
	mov	w5,w22
	dup	v20.4s,v0.s[1]
	lsr	x6,x22,#32
	dup	v24.4s,v0.s[2]
	mov	w7,w23
	dup	v28.4s,v0.s[3]
	lsr	x8,x23,#32
	dup	v17.4s,v1.s[0]
	mov	w9,w24
	dup	v21.4s,v1.s[1]
	lsr	x10,x24,#32
	dup	v25.4s,v1.s[2]
	mov	w11,w25
	dup	v29.4s,v1.s[3]
	lsr	x12,x25,#32
	dup	v19.4s,v3.s[0]
	mov	w13,w26
	dup	v23.4s,v3.s[1]
	lsr	x14,x26,#32
	dup	v27.4s,v3.s[2]
	mov	w15,w27
	dup	v31.4s,v3.s[3]
	lsr	x16,x27,#32
	add	v19.4s,v19.4s,v8.4s	// give each lane its own block counter
	mov	w17,w28
	dup	v18.4s,v2.s[0]
	lsr	x19,x28,#32
	dup	v22.4s,v2.s[1]
	mov	w20,w30
	dup	v26.4s,v2.s[2]
	lsr	x21,x30,#32
	dup	v30.4s,v2.s[3]

	mov	x4,#10
	// Flags from this subs feed "b.lo .Ltail_neon" and
	// "b.hi .Loop_outer_neon"; nothing in between sets flags.
	subs	x2,x2,#320
.Loop_neon:
	sub	x4,x4,#1
	// ---- first half of the round pair ----
	add	v16.4s,v16.4s,v17.4s
	add	w5,w5,w9
	add	v20.4s,v20.4s,v21.4s
	add	w6,w6,w10
	add	v24.4s,v24.4s,v25.4s
	add	w7,w7,w11
	add	v28.4s,v28.4s,v29.4s
	add	w8,w8,w12
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w5
	eor	v23.16b,v23.16b,v20.16b
	eor	w19,w19,w6
	eor	v27.16b,v27.16b,v24.16b
	eor	w20,w20,w7
	eor	v31.16b,v31.16b,v28.16b
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h		// swap 16-bit halves = rotate by 16
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w17
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w19
	add	v26.4s,v26.4s,v27.4s
	add	w15,w15,w20
	add	v30.4s,v30.4s,v31.4s
	add	w16,w16,w21
	eor	v4.16b,v17.16b,v18.16b
	eor	w9,w9,w13
	eor	v5.16b,v21.16b,v22.16b
	eor	w10,w10,w14
	eor	v6.16b,v25.16b,v26.16b
	eor	w11,w11,w15
	eor	v7.16b,v29.16b,v30.16b
	eor	w12,w12,w16
	ushr	v17.4s,v4.4s,#20	// ushr + sli together form the rotate
	ror	w9,w9,#20
	ushr	v21.4s,v5.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v6.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v7.4s,#20
	ror	w12,w12,#20
	sli	v17.4s,v4.4s,#12
	add	w5,w5,w9
	sli	v21.4s,v5.4s,#12
	add	w6,w6,w10
	sli	v25.4s,v6.4s,#12
	add	w7,w7,w11
	sli	v29.4s,v7.4s,#12
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	add	v24.4s,v24.4s,v25.4s
	eor	w20,w20,w7
	add	v28.4s,v28.4s,v29.4s
	eor	w21,w21,w8
	eor	v4.16b,v19.16b,v16.16b
	ror	w17,w17,#24
	eor	v5.16b,v23.16b,v20.16b
	ror	w19,w19,#24
	eor	v6.16b,v27.16b,v24.16b
	ror	w20,w20,#24
	eor	v7.16b,v31.16b,v28.16b
	ror	w21,w21,#24
	tbl	v19.16b,{v4.16b},v9.16b	// byte shuffle via .Lrot24
	add	w13,w13,w17
	tbl	v23.16b,{v5.16b},v9.16b
	add	w14,w14,w19
	tbl	v27.16b,{v6.16b},v9.16b
	add	w15,w15,w20
	tbl	v31.16b,{v7.16b},v9.16b
	add	w16,w16,w21
	add	v18.4s,v18.4s,v19.4s
	eor	w9,w9,w13
	add	v22.4s,v22.4s,v23.4s
	eor	w10,w10,w14
	add	v26.4s,v26.4s,v27.4s
	eor	w11,w11,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w12,w12,w16
	eor	v4.16b,v17.16b,v18.16b
	ror	w9,w9,#25
	eor	v5.16b,v21.16b,v22.16b
	ror	w10,w10,#25
	eor	v6.16b,v25.16b,v26.16b
	ror	w11,w11,#25
	eor	v7.16b,v29.16b,v30.16b
	ror	w12,w12,#25
	ushr	v17.4s,v4.4s,#25
	ushr	v21.4s,v5.4s,#25
	ushr	v25.4s,v6.4s,#25
	ushr	v29.4s,v7.4s,#25
	sli	v17.4s,v4.4s,#7
	sli	v21.4s,v5.4s,#7
	sli	v25.4s,v6.4s,#7
	sli	v29.4s,v7.4s,#7
	// ---- second half of the round pair ----
	add	v16.4s,v16.4s,v21.4s
	add	w5,w5,w10
	add	v20.4s,v20.4s,v25.4s
	add	w6,w6,w11
	add	v24.4s,v24.4s,v29.4s
	add	w7,w7,w12
	add	v28.4s,v28.4s,v17.4s
	add	w8,w8,w9
	eor	v31.16b,v31.16b,v16.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v20.16b
	eor	w17,w17,w6
	eor	v23.16b,v23.16b,v24.16b
	eor	w19,w19,w7
	eor	v27.16b,v27.16b,v28.16b
	eor	w20,w20,w8
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	add	v26.4s,v26.4s,v31.4s
	add	w15,w15,w21
	add	v30.4s,v30.4s,v19.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v23.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v27.4s
	add	w14,w14,w20
	eor	v4.16b,v21.16b,v26.16b
	eor	w10,w10,w15
	eor	v5.16b,v25.16b,v30.16b
	eor	w11,w11,w16
	eor	v6.16b,v29.16b,v18.16b
	eor	w12,w12,w13
	eor	v7.16b,v17.16b,v22.16b
	eor	w9,w9,w14
	ushr	v21.4s,v4.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v5.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v6.4s,#20
	ror	w12,w12,#20
	ushr	v17.4s,v7.4s,#20
	ror	w9,w9,#20
	sli	v21.4s,v4.4s,#12
	add	w5,w5,w10
	sli	v25.4s,v5.4s,#12
	add	w6,w6,w11
	sli	v29.4s,v6.4s,#12
	add	w7,w7,w12
	sli	v17.4s,v7.4s,#12
	add	w8,w8,w9
	add	v16.4s,v16.4s,v21.4s
	eor	w21,w21,w5
	add	v20.4s,v20.4s,v25.4s
	eor	w17,w17,w6
	add	v24.4s,v24.4s,v29.4s
	eor	w19,w19,w7
	add	v28.4s,v28.4s,v17.4s
	eor	w20,w20,w8
	eor	v4.16b,v31.16b,v16.16b
	ror	w21,w21,#24
	eor	v5.16b,v19.16b,v20.16b
	ror	w17,w17,#24
	eor	v6.16b,v23.16b,v24.16b
	ror	w19,w19,#24
	eor	v7.16b,v27.16b,v28.16b
	ror	w20,w20,#24
	tbl	v31.16b,{v4.16b},v9.16b
	add	w15,w15,w21
	tbl	v19.16b,{v5.16b},v9.16b
	add	w16,w16,w17
	tbl	v23.16b,{v6.16b},v9.16b
	add	w13,w13,w19
	tbl	v27.16b,{v7.16b},v9.16b
	add	w14,w14,w20
	add	v26.4s,v26.4s,v31.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v19.4s
	eor	w11,w11,w16
	add	v18.4s,v18.4s,v23.4s
	eor	w12,w12,w13
	add	v22.4s,v22.4s,v27.4s
	eor	w9,w9,w14
	eor	v4.16b,v21.16b,v26.16b
	ror	w10,w10,#25
	eor	v5.16b,v25.16b,v30.16b
	ror	w11,w11,#25
	eor	v6.16b,v29.16b,v18.16b
	ror	w12,w12,#25
	eor	v7.16b,v17.16b,v22.16b
	ror	w9,w9,#25
	ushr	v21.4s,v4.4s,#25
	ushr	v25.4s,v5.4s,#25
	ushr	v29.4s,v6.4s,#25
	ushr	v17.4s,v7.4s,#25
	sli	v21.4s,v4.4s,#7
	sli	v25.4s,v5.4s,#7
	sli	v29.4s,v6.4s,#7
	sli	v17.4s,v7.4s,#7
	cbnz	x4,.Loop_neon

	// Add the per-lane block offsets to the counter word while it is
	// still in one-lane-per-block form; the base state in v0..v3 is
	// added after the transpose.
	add	v19.4s,v19.4s,v8.4s

	zip1	v4.4s,v16.4s,v20.4s	// transpose data
	zip1	v5.4s,v24.4s,v28.4s
	zip2	v6.4s,v16.4s,v20.4s
	zip2	v7.4s,v24.4s,v28.4s
	zip1	v16.2d,v4.2d,v5.2d
	zip2	v20.2d,v4.2d,v5.2d
	zip1	v24.2d,v6.2d,v7.2d
	zip2	v28.2d,v6.2d,v7.2d

	zip1	v4.4s,v17.4s,v21.4s
	zip1	v5.4s,v25.4s,v29.4s
	zip2	v6.4s,v17.4s,v21.4s
	zip2	v7.4s,v25.4s,v29.4s
	zip1	v17.2d,v4.2d,v5.2d
	zip2	v21.2d,v4.2d,v5.2d
	zip1	v25.2d,v6.2d,v7.2d
	zip2	v29.2d,v6.2d,v7.2d

	zip1	v4.4s,v18.4s,v22.4s
	add	w5,w5,w22		// accumulate key block
	zip1	v5.4s,v26.4s,v30.4s
	add	x6,x6,x22,lsr#32
	zip2	v6.4s,v18.4s,v22.4s
	add	w7,w7,w23
	zip2	v7.4s,v26.4s,v30.4s
	add	x8,x8,x23,lsr#32
	zip1	v18.2d,v4.2d,v5.2d
	add	w9,w9,w24
	zip2	v22.2d,v4.2d,v5.2d
	add	x10,x10,x24,lsr#32
	zip1	v26.2d,v6.2d,v7.2d
	add	w11,w11,w25
	zip2	v30.2d,v6.2d,v7.2d
	add	x12,x12,x25,lsr#32

	zip1	v4.4s,v19.4s,v23.4s
	add	w13,w13,w26
	zip1	v5.4s,v27.4s,v31.4s
	add	x14,x14,x26,lsr#32
	zip2	v6.4s,v19.4s,v23.4s
	add	w15,w15,w27
	zip2	v7.4s,v27.4s,v31.4s
	add	x16,x16,x27,lsr#32
	zip1	v19.2d,v4.2d,v5.2d
	add	w17,w17,w28
	zip2	v23.2d,v4.2d,v5.2d
	add	x19,x19,x28,lsr#32
	zip1	v27.2d,v6.2d,v7.2d
	add	w20,w20,w30
	zip2	v31.2d,v6.2d,v7.2d
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail_neon		// fewer than 320 bytes were left (flags from subs)

	// Full 320-byte pass.  After the transpose v16..v19, v20..v23,
	// v24..v27 and v28..v31 each hold the four rows of one NEON block.
	add	x5,x5,x6,lsl#32		// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	v17.4s,v17.4s,v1.4s
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	v18.4s,v18.4s,v2.4s
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	x5,x5,x6
	add	v20.4s,v20.4s,v0.4s
	eor	x7,x7,x8
	add	v21.4s,v21.4s,v1.4s
	eor	x9,x9,x10
	add	v22.4s,v22.4s,v2.4s
	eor	x11,x11,x12
	add	v23.4s,v23.4s,v3.4s
	eor	x13,x13,x14
	eor	v16.16b,v16.16b,v4.16b
	movi	v4.4s,#5
	eor	x15,x15,x16
	eor	v17.16b,v17.16b,v5.16b
	eor	x17,x17,x19
	eor	v18.16b,v18.16b,v6.16b
	eor	x20,x20,x21
	eor	v19.16b,v19.16b,v7.16b
	add	v8.4s,v8.4s,v4.4s	// += 5
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#5		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	add	v24.4s,v24.4s,v0.4s
	add	v25.4s,v25.4s,v1.4s
	add	v26.4s,v26.4s,v2.4s
	add	v27.4s,v27.4s,v3.4s
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64

	eor	v20.16b,v20.16b,v4.16b
	eor	v21.16b,v21.16b,v5.16b
	eor	v22.16b,v22.16b,v6.16b
	eor	v23.16b,v23.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v1.4s
	add	v30.4s,v30.4s,v2.4s
	add	v31.4s,v31.4s,v3.4s
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	b.hi	.Loop_outer_neon	// more input remains (flags from subs)

	ldp	d8,d9,[sp]		// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail_neon:
	// Fewer than 320 bytes remain.  The scalar block goes out first,
	// then the NEON blocks one at a time for as long as a whole 64 bytes
	// is left.  d8/d9 are reloaded right away because their save slot is
	// the same stack area .Last_neon writes keystream into.
	add	x2,x2,#320
	ldp	d8,d9,[sp]		// meet ABI requirements
	cmp	x2,#64
	b.lo	.Less_than_64		// scalar tail code in ChaCha20_ctr32_dflt

	add	x5,x5,x6,lsl#32		// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	stp	x9,x11,[x0,#16]
	add	v17.4s,v17.4s,v1.4s
	stp	x13,x15,[x0,#32]
	add	v18.4s,v18.4s,v2.4s
	stp	x17,x20,[x0,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x0,x0,#64
	b.eq	.Ldone_neon		// exactly 64 bytes (flags from cmp above)
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v16.16b,v16.16b,v4.16b
	eor	v17.16b,v17.16b,v5.16b
	eor	v18.16b,v18.16b,v6.16b
	eor	v19.16b,v19.16b,v7.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	b.eq	.Ldone_neon

	// v16..v19 always hold the next unused keystream block, so
	// .Last_neon can be reached from any step.
	add	v16.4s,v20.4s,v0.4s
	add	v17.4s,v21.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v22.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v23.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v20.16b,v16.16b,v4.16b
	eor	v21.16b,v17.16b,v5.16b
	eor	v22.16b,v18.16b,v6.16b
	eor	v23.16b,v19.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v24.4s,v0.4s
	add	v17.4s,v25.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v26.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v27.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v24.16b,v16.16b,v4.16b
	eor	v25.16b,v17.16b,v5.16b
	eor	v26.16b,v18.16b,v6.16b
	eor	v27.16b,v19.16b,v7.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v28.4s,v0.4s
	add	v17.4s,v29.4s,v1.4s
	add	v18.4s,v30.4s,v2.4s
	add	v19.4s,v31.4s,v3.4s
	sub	x2,x2,#64

.Last_neon:
	// Partial final block: keystream goes to the stack scratch and is
	// XORed in bytewise, using the same negative-index walk as the
	// scalar tail code.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]

	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]		// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// Start of ChaCha20_512_neon.  Only its first instructions fall inside this
// span; they are reproduced unchanged and the routine continues below.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
930 add x29,sp,#0 931 932 adrp x5,.Lsigma 933 add x5,x5,#:lo12:.Lsigma 934 stp x19,x20,[sp,#16] 935 stp x21,x22,[sp,#32] 936 stp x23,x24,[sp,#48] 937 stp x25,x26,[sp,#64] 938 stp x27,x28,[sp,#80] 939 940 .L512_or_more_neon: 941 sub sp,sp,#128+64 942 943 eor v7.16b,v7.16b,v7.16b 944 ldp x22,x23,[x5] // load sigma 945 ld1 {v0.4s},[x5],#16 946 ldp x24,x25,[x3] // load key 947 ldp x26,x27,[x3,#16] 948 ld1 {v1.4s,v2.4s},[x3] 949 ldp x28,x30,[x4] // load counter 950 ld1 {v3.4s},[x4] 951 ld1 {v7.s}[0],[x5] 952 add x3,x5,#16 // .Lrot24 953 #ifdef __AARCH64EB__ 954 rev64 v0.4s,v0.4s 955 ror x24,x24,#32 956 ror x25,x25,#32 957 ror x26,x26,#32 958 ror x27,x27,#32 959 ror x28,x28,#32 960 ror x30,x30,#32 961 #endif 962 add v3.4s,v3.4s,v7.4s // += 1 963 stp q0,q1,[sp,#0] // off-load key block, invariant part 964 add v3.4s,v3.4s,v7.4s // not typo 965 str q2,[sp,#32] 966 add v4.4s,v3.4s,v7.4s 967 add v5.4s,v4.4s,v7.4s 968 add v6.4s,v5.4s,v7.4s 969 shl v7.4s,v7.4s,#2 // 1 -> 4 970 971 stp d8,d9,[sp,#128+0] // meet ABI requirements 972 stp d10,d11,[sp,#128+16] 973 stp d12,d13,[sp,#128+32] 974 stp d14,d15,[sp,#128+48] 975 976 sub x2,x2,#512 // not typo 977 978 .Loop_outer_512_neon: 979 mov v8.16b,v0.16b 980 mov v12.16b,v0.16b 981 mov v16.16b,v0.16b 982 mov v20.16b,v0.16b 983 mov v24.16b,v0.16b 984 mov v28.16b,v0.16b 985 mov v9.16b,v1.16b 986 mov w5,w22 // unpack key block 987 mov v13.16b,v1.16b 988 lsr x6,x22,#32 989 mov v17.16b,v1.16b 990 mov w7,w23 991 mov v21.16b,v1.16b 992 lsr x8,x23,#32 993 mov v25.16b,v1.16b 994 mov w9,w24 995 mov v29.16b,v1.16b 996 lsr x10,x24,#32 997 mov v11.16b,v3.16b 998 mov w11,w25 999 mov v15.16b,v4.16b 1000 lsr x12,x25,#32 1001 mov v19.16b,v5.16b 1002 mov w13,w26 1003 mov v23.16b,v6.16b 1004 lsr x14,x26,#32 1005 mov v10.16b,v2.16b 1006 mov w15,w27 1007 mov v14.16b,v2.16b 1008 lsr x16,x27,#32 1009 add v27.4s,v11.4s,v7.4s // +4 1010 mov w17,w28 1011 add v31.4s,v15.4s,v7.4s // +4 1012 lsr x19,x28,#32 1013 mov v18.16b,v2.16b 1014 mov w20,w30 1015 mov 
v22.16b,v2.16b 1016 lsr x21,x30,#32 1017 mov v26.16b,v2.16b 1018 stp q3,q4,[sp,#48] // off-load key block, variable part 1019 mov v30.16b,v2.16b 1020 stp q5,q6,[sp,#80] 1021 1022 mov x4,#5 1023 ld1 {v6.4s},[x3] 1024 subs x2,x2,#512 1025 .Loop_upper_neon: 1026 sub x4,x4,#1 1027 add v8.4s,v8.4s,v9.4s 1028 add w5,w5,w9 1029 add v12.4s,v12.4s,v13.4s 1030 add w6,w6,w10 1031 add v16.4s,v16.4s,v17.4s 1032 add w7,w7,w11 1033 add v20.4s,v20.4s,v21.4s 1034 add w8,w8,w12 1035 add v24.4s,v24.4s,v25.4s 1036 eor w17,w17,w5 1037 add v28.4s,v28.4s,v29.4s 1038 eor w19,w19,w6 1039 eor v11.16b,v11.16b,v8.16b 1040 eor w20,w20,w7 1041 eor v15.16b,v15.16b,v12.16b 1042 eor w21,w21,w8 1043 eor v19.16b,v19.16b,v16.16b 1044 ror w17,w17,#16 1045 eor v23.16b,v23.16b,v20.16b 1046 ror w19,w19,#16 1047 eor v27.16b,v27.16b,v24.16b 1048 ror w20,w20,#16 1049 eor v31.16b,v31.16b,v28.16b 1050 ror w21,w21,#16 1051 rev32 v11.8h,v11.8h 1052 add w13,w13,w17 1053 rev32 v15.8h,v15.8h 1054 add w14,w14,w19 1055 rev32 v19.8h,v19.8h 1056 add w15,w15,w20 1057 rev32 v23.8h,v23.8h 1058 add w16,w16,w21 1059 rev32 v27.8h,v27.8h 1060 eor w9,w9,w13 1061 rev32 v31.8h,v31.8h 1062 eor w10,w10,w14 1063 add v10.4s,v10.4s,v11.4s 1064 eor w11,w11,w15 1065 add v14.4s,v14.4s,v15.4s 1066 eor w12,w12,w16 1067 add v18.4s,v18.4s,v19.4s 1068 ror w9,w9,#20 1069 add v22.4s,v22.4s,v23.4s 1070 ror w10,w10,#20 1071 add v26.4s,v26.4s,v27.4s 1072 ror w11,w11,#20 1073 add v30.4s,v30.4s,v31.4s 1074 ror w12,w12,#20 1075 eor v0.16b,v9.16b,v10.16b 1076 add w5,w5,w9 1077 eor v1.16b,v13.16b,v14.16b 1078 add w6,w6,w10 1079 eor v2.16b,v17.16b,v18.16b 1080 add w7,w7,w11 1081 eor v3.16b,v21.16b,v22.16b 1082 add w8,w8,w12 1083 eor v4.16b,v25.16b,v26.16b 1084 eor w17,w17,w5 1085 eor v5.16b,v29.16b,v30.16b 1086 eor w19,w19,w6 1087 ushr v9.4s,v0.4s,#20 1088 eor w20,w20,w7 1089 ushr v13.4s,v1.4s,#20 1090 eor w21,w21,w8 1091 ushr v17.4s,v2.4s,#20 1092 ror w17,w17,#24 1093 ushr v21.4s,v3.4s,#20 1094 ror w19,w19,#24 1095 ushr v25.4s,v4.4s,#20 1096 ror 
w20,w20,#24 1097 ushr v29.4s,v5.4s,#20 1098 ror w21,w21,#24 1099 sli v9.4s,v0.4s,#12 1100 add w13,w13,w17 1101 sli v13.4s,v1.4s,#12 1102 add w14,w14,w19 1103 sli v17.4s,v2.4s,#12 1104 add w15,w15,w20 1105 sli v21.4s,v3.4s,#12 1106 add w16,w16,w21 1107 sli v25.4s,v4.4s,#12 1108 eor w9,w9,w13 1109 sli v29.4s,v5.4s,#12 1110 eor w10,w10,w14 1111 add v8.4s,v8.4s,v9.4s 1112 eor w11,w11,w15 1113 add v12.4s,v12.4s,v13.4s 1114 eor w12,w12,w16 1115 add v16.4s,v16.4s,v17.4s 1116 ror w9,w9,#25 1117 add v20.4s,v20.4s,v21.4s 1118 ror w10,w10,#25 1119 add v24.4s,v24.4s,v25.4s 1120 ror w11,w11,#25 1121 add v28.4s,v28.4s,v29.4s 1122 ror w12,w12,#25 1123 eor v11.16b,v11.16b,v8.16b 1124 add w5,w5,w10 1125 eor v15.16b,v15.16b,v12.16b 1126 add w6,w6,w11 1127 eor v19.16b,v19.16b,v16.16b 1128 add w7,w7,w12 1129 eor v23.16b,v23.16b,v20.16b 1130 add w8,w8,w9 1131 eor v27.16b,v27.16b,v24.16b 1132 eor w21,w21,w5 1133 eor v31.16b,v31.16b,v28.16b 1134 eor w17,w17,w6 1135 tbl v11.16b,{v11.16b},v6.16b 1136 eor w19,w19,w7 1137 tbl v15.16b,{v15.16b},v6.16b 1138 eor w20,w20,w8 1139 tbl v19.16b,{v19.16b},v6.16b 1140 ror w21,w21,#16 1141 tbl v23.16b,{v23.16b},v6.16b 1142 ror w17,w17,#16 1143 tbl v27.16b,{v27.16b},v6.16b 1144 ror w19,w19,#16 1145 tbl v31.16b,{v31.16b},v6.16b 1146 ror w20,w20,#16 1147 add v10.4s,v10.4s,v11.4s 1148 add w15,w15,w21 1149 add v14.4s,v14.4s,v15.4s 1150 add w16,w16,w17 1151 add v18.4s,v18.4s,v19.4s 1152 add w13,w13,w19 1153 add v22.4s,v22.4s,v23.4s 1154 add w14,w14,w20 1155 add v26.4s,v26.4s,v27.4s 1156 eor w10,w10,w15 1157 add v30.4s,v30.4s,v31.4s 1158 eor w11,w11,w16 1159 eor v0.16b,v9.16b,v10.16b 1160 eor w12,w12,w13 1161 eor v1.16b,v13.16b,v14.16b 1162 eor w9,w9,w14 1163 eor v2.16b,v17.16b,v18.16b 1164 ror w10,w10,#20 1165 eor v3.16b,v21.16b,v22.16b 1166 ror w11,w11,#20 1167 eor v4.16b,v25.16b,v26.16b 1168 ror w12,w12,#20 1169 eor v5.16b,v29.16b,v30.16b 1170 ror w9,w9,#20 1171 ushr v9.4s,v0.4s,#25 1172 add w5,w5,w10 1173 ushr v13.4s,v1.4s,#25 1174 add w6,w6,w11 1175 ushr 
v17.4s,v2.4s,#25 1176 add w7,w7,w12 1177 ushr v21.4s,v3.4s,#25 1178 add w8,w8,w9 1179 ushr v25.4s,v4.4s,#25 1180 eor w21,w21,w5 1181 ushr v29.4s,v5.4s,#25 1182 eor w17,w17,w6 1183 sli v9.4s,v0.4s,#7 1184 eor w19,w19,w7 1185 sli v13.4s,v1.4s,#7 1186 eor w20,w20,w8 1187 sli v17.4s,v2.4s,#7 1188 ror w21,w21,#24 1189 sli v21.4s,v3.4s,#7 1190 ror w17,w17,#24 1191 sli v25.4s,v4.4s,#7 1192 ror w19,w19,#24 1193 sli v29.4s,v5.4s,#7 1194 ror w20,w20,#24 1195 ext v10.16b,v10.16b,v10.16b,#8 1196 add w15,w15,w21 1197 ext v14.16b,v14.16b,v14.16b,#8 1198 add w16,w16,w17 1199 ext v18.16b,v18.16b,v18.16b,#8 1200 add w13,w13,w19 1201 ext v22.16b,v22.16b,v22.16b,#8 1202 add w14,w14,w20 1203 ext v26.16b,v26.16b,v26.16b,#8 1204 eor w10,w10,w15 1205 ext v30.16b,v30.16b,v30.16b,#8 1206 eor w11,w11,w16 1207 ext v11.16b,v11.16b,v11.16b,#12 1208 eor w12,w12,w13 1209 ext v15.16b,v15.16b,v15.16b,#12 1210 eor w9,w9,w14 1211 ext v19.16b,v19.16b,v19.16b,#12 1212 ror w10,w10,#25 1213 ext v23.16b,v23.16b,v23.16b,#12 1214 ror w11,w11,#25 1215 ext v27.16b,v27.16b,v27.16b,#12 1216 ror w12,w12,#25 1217 ext v31.16b,v31.16b,v31.16b,#12 1218 ror w9,w9,#25 1219 ext v9.16b,v9.16b,v9.16b,#4 1220 ext v13.16b,v13.16b,v13.16b,#4 1221 ext v17.16b,v17.16b,v17.16b,#4 1222 ext v21.16b,v21.16b,v21.16b,#4 1223 ext v25.16b,v25.16b,v25.16b,#4 1224 ext v29.16b,v29.16b,v29.16b,#4 1225 add v8.4s,v8.4s,v9.4s 1226 add w5,w5,w9 1227 add v12.4s,v12.4s,v13.4s 1228 add w6,w6,w10 1229 add v16.4s,v16.4s,v17.4s 1230 add w7,w7,w11 1231 add v20.4s,v20.4s,v21.4s 1232 add w8,w8,w12 1233 add v24.4s,v24.4s,v25.4s 1234 eor w17,w17,w5 1235 add v28.4s,v28.4s,v29.4s 1236 eor w19,w19,w6 1237 eor v11.16b,v11.16b,v8.16b 1238 eor w20,w20,w7 1239 eor v15.16b,v15.16b,v12.16b 1240 eor w21,w21,w8 1241 eor v19.16b,v19.16b,v16.16b 1242 ror w17,w17,#16 1243 eor v23.16b,v23.16b,v20.16b 1244 ror w19,w19,#16 1245 eor v27.16b,v27.16b,v24.16b 1246 ror w20,w20,#16 1247 eor v31.16b,v31.16b,v28.16b 1248 ror w21,w21,#16 1249 rev32 v11.8h,v11.8h 1250 add 
w13,w13,w17 1251 rev32 v15.8h,v15.8h 1252 add w14,w14,w19 1253 rev32 v19.8h,v19.8h 1254 add w15,w15,w20 1255 rev32 v23.8h,v23.8h 1256 add w16,w16,w21 1257 rev32 v27.8h,v27.8h 1258 eor w9,w9,w13 1259 rev32 v31.8h,v31.8h 1260 eor w10,w10,w14 1261 add v10.4s,v10.4s,v11.4s 1262 eor w11,w11,w15 1263 add v14.4s,v14.4s,v15.4s 1264 eor w12,w12,w16 1265 add v18.4s,v18.4s,v19.4s 1266 ror w9,w9,#20 1267 add v22.4s,v22.4s,v23.4s 1268 ror w10,w10,#20 1269 add v26.4s,v26.4s,v27.4s 1270 ror w11,w11,#20 1271 add v30.4s,v30.4s,v31.4s 1272 ror w12,w12,#20 1273 eor v0.16b,v9.16b,v10.16b 1274 add w5,w5,w9 1275 eor v1.16b,v13.16b,v14.16b 1276 add w6,w6,w10 1277 eor v2.16b,v17.16b,v18.16b 1278 add w7,w7,w11 1279 eor v3.16b,v21.16b,v22.16b 1280 add w8,w8,w12 1281 eor v4.16b,v25.16b,v26.16b 1282 eor w17,w17,w5 1283 eor v5.16b,v29.16b,v30.16b 1284 eor w19,w19,w6 1285 ushr v9.4s,v0.4s,#20 1286 eor w20,w20,w7 1287 ushr v13.4s,v1.4s,#20 1288 eor w21,w21,w8 1289 ushr v17.4s,v2.4s,#20 1290 ror w17,w17,#24 1291 ushr v21.4s,v3.4s,#20 1292 ror w19,w19,#24 1293 ushr v25.4s,v4.4s,#20 1294 ror w20,w20,#24 1295 ushr v29.4s,v5.4s,#20 1296 ror w21,w21,#24 1297 sli v9.4s,v0.4s,#12 1298 add w13,w13,w17 1299 sli v13.4s,v1.4s,#12 1300 add w14,w14,w19 1301 sli v17.4s,v2.4s,#12 1302 add w15,w15,w20 1303 sli v21.4s,v3.4s,#12 1304 add w16,w16,w21 1305 sli v25.4s,v4.4s,#12 1306 eor w9,w9,w13 1307 sli v29.4s,v5.4s,#12 1308 eor w10,w10,w14 1309 add v8.4s,v8.4s,v9.4s 1310 eor w11,w11,w15 1311 add v12.4s,v12.4s,v13.4s 1312 eor w12,w12,w16 1313 add v16.4s,v16.4s,v17.4s 1314 ror w9,w9,#25 1315 add v20.4s,v20.4s,v21.4s 1316 ror w10,w10,#25 1317 add v24.4s,v24.4s,v25.4s 1318 ror w11,w11,#25 1319 add v28.4s,v28.4s,v29.4s 1320 ror w12,w12,#25 1321 eor v11.16b,v11.16b,v8.16b 1322 add w5,w5,w10 1323 eor v15.16b,v15.16b,v12.16b 1324 add w6,w6,w11 1325 eor v19.16b,v19.16b,v16.16b 1326 add w7,w7,w12 1327 eor v23.16b,v23.16b,v20.16b 1328 add w8,w8,w9 1329 eor v27.16b,v27.16b,v24.16b 1330 eor w21,w21,w5 1331 eor 
v31.16b,v31.16b,v28.16b 1332 eor w17,w17,w6 1333 tbl v11.16b,{v11.16b},v6.16b 1334 eor w19,w19,w7 1335 tbl v15.16b,{v15.16b},v6.16b 1336 eor w20,w20,w8 1337 tbl v19.16b,{v19.16b},v6.16b 1338 ror w21,w21,#16 1339 tbl v23.16b,{v23.16b},v6.16b 1340 ror w17,w17,#16 1341 tbl v27.16b,{v27.16b},v6.16b 1342 ror w19,w19,#16 1343 tbl v31.16b,{v31.16b},v6.16b 1344 ror w20,w20,#16 1345 add v10.4s,v10.4s,v11.4s 1346 add w15,w15,w21 1347 add v14.4s,v14.4s,v15.4s 1348 add w16,w16,w17 1349 add v18.4s,v18.4s,v19.4s 1350 add w13,w13,w19 1351 add v22.4s,v22.4s,v23.4s 1352 add w14,w14,w20 1353 add v26.4s,v26.4s,v27.4s 1354 eor w10,w10,w15 1355 add v30.4s,v30.4s,v31.4s 1356 eor w11,w11,w16 1357 eor v0.16b,v9.16b,v10.16b 1358 eor w12,w12,w13 1359 eor v1.16b,v13.16b,v14.16b 1360 eor w9,w9,w14 1361 eor v2.16b,v17.16b,v18.16b 1362 ror w10,w10,#20 1363 eor v3.16b,v21.16b,v22.16b 1364 ror w11,w11,#20 1365 eor v4.16b,v25.16b,v26.16b 1366 ror w12,w12,#20 1367 eor v5.16b,v29.16b,v30.16b 1368 ror w9,w9,#20 1369 ushr v9.4s,v0.4s,#25 1370 add w5,w5,w10 1371 ushr v13.4s,v1.4s,#25 1372 add w6,w6,w11 1373 ushr v17.4s,v2.4s,#25 1374 add w7,w7,w12 1375 ushr v21.4s,v3.4s,#25 1376 add w8,w8,w9 1377 ushr v25.4s,v4.4s,#25 1378 eor w21,w21,w5 1379 ushr v29.4s,v5.4s,#25 1380 eor w17,w17,w6 1381 sli v9.4s,v0.4s,#7 1382 eor w19,w19,w7 1383 sli v13.4s,v1.4s,#7 1384 eor w20,w20,w8 1385 sli v17.4s,v2.4s,#7 1386 ror w21,w21,#24 1387 sli v21.4s,v3.4s,#7 1388 ror w17,w17,#24 1389 sli v25.4s,v4.4s,#7 1390 ror w19,w19,#24 1391 sli v29.4s,v5.4s,#7 1392 ror w20,w20,#24 1393 ext v10.16b,v10.16b,v10.16b,#8 1394 add w15,w15,w21 1395 ext v14.16b,v14.16b,v14.16b,#8 1396 add w16,w16,w17 1397 ext v18.16b,v18.16b,v18.16b,#8 1398 add w13,w13,w19 1399 ext v22.16b,v22.16b,v22.16b,#8 1400 add w14,w14,w20 1401 ext v26.16b,v26.16b,v26.16b,#8 1402 eor w10,w10,w15 1403 ext v30.16b,v30.16b,v30.16b,#8 1404 eor w11,w11,w16 1405 ext v11.16b,v11.16b,v11.16b,#4 1406 eor w12,w12,w13 1407 ext v15.16b,v15.16b,v15.16b,#4 1408 eor w9,w9,w14 1409 
ext v19.16b,v19.16b,v19.16b,#4 1410 ror w10,w10,#25 1411 ext v23.16b,v23.16b,v23.16b,#4 1412 ror w11,w11,#25 1413 ext v27.16b,v27.16b,v27.16b,#4 1414 ror w12,w12,#25 1415 ext v31.16b,v31.16b,v31.16b,#4 1416 ror w9,w9,#25 1417 ext v9.16b,v9.16b,v9.16b,#12 1418 ext v13.16b,v13.16b,v13.16b,#12 1419 ext v17.16b,v17.16b,v17.16b,#12 1420 ext v21.16b,v21.16b,v21.16b,#12 1421 ext v25.16b,v25.16b,v25.16b,#12 1422 ext v29.16b,v29.16b,v29.16b,#12 1423 cbnz x4,.Loop_upper_neon 1424 1425 add w5,w5,w22 // accumulate key block 1426 add x6,x6,x22,lsr#32 1427 add w7,w7,w23 1428 add x8,x8,x23,lsr#32 1429 add w9,w9,w24 1430 add x10,x10,x24,lsr#32 1431 add w11,w11,w25 1432 add x12,x12,x25,lsr#32 1433 add w13,w13,w26 1434 add x14,x14,x26,lsr#32 1435 add w15,w15,w27 1436 add x16,x16,x27,lsr#32 1437 add w17,w17,w28 1438 add x19,x19,x28,lsr#32 1439 add w20,w20,w30 1440 add x21,x21,x30,lsr#32 1441 1442 add x5,x5,x6,lsl#32 // pack 1443 add x7,x7,x8,lsl#32 1444 ldp x6,x8,[x1,#0] // load input 1445 add x9,x9,x10,lsl#32 1446 add x11,x11,x12,lsl#32 1447 ldp x10,x12,[x1,#16] 1448 add x13,x13,x14,lsl#32 1449 add x15,x15,x16,lsl#32 1450 ldp x14,x16,[x1,#32] 1451 add x17,x17,x19,lsl#32 1452 add x20,x20,x21,lsl#32 1453 ldp x19,x21,[x1,#48] 1454 add x1,x1,#64 1455 #ifdef __AARCH64EB__ 1456 rev x5,x5 1457 rev x7,x7 1458 rev x9,x9 1459 rev x11,x11 1460 rev x13,x13 1461 rev x15,x15 1462 rev x17,x17 1463 rev x20,x20 1464 #endif 1465 eor x5,x5,x6 1466 eor x7,x7,x8 1467 eor x9,x9,x10 1468 eor x11,x11,x12 1469 eor x13,x13,x14 1470 eor x15,x15,x16 1471 eor x17,x17,x19 1472 eor x20,x20,x21 1473 1474 stp x5,x7,[x0,#0] // store output 1475 add x28,x28,#1 // increment counter 1476 mov w5,w22 // unpack key block 1477 lsr x6,x22,#32 1478 stp x9,x11,[x0,#16] 1479 mov w7,w23 1480 lsr x8,x23,#32 1481 stp x13,x15,[x0,#32] 1482 mov w9,w24 1483 lsr x10,x24,#32 1484 stp x17,x20,[x0,#48] 1485 add x0,x0,#64 1486 mov w11,w25 1487 lsr x12,x25,#32 1488 mov w13,w26 1489 lsr x14,x26,#32 1490 mov w15,w27 1491 lsr x16,x27,#32 
1492 mov w17,w28 1493 lsr x19,x28,#32 1494 mov w20,w30 1495 lsr x21,x30,#32 1496 1497 mov x4,#5 1498 .Loop_lower_neon: 1499 sub x4,x4,#1 1500 add v8.4s,v8.4s,v9.4s 1501 add w5,w5,w9 1502 add v12.4s,v12.4s,v13.4s 1503 add w6,w6,w10 1504 add v16.4s,v16.4s,v17.4s 1505 add w7,w7,w11 1506 add v20.4s,v20.4s,v21.4s 1507 add w8,w8,w12 1508 add v24.4s,v24.4s,v25.4s 1509 eor w17,w17,w5 1510 add v28.4s,v28.4s,v29.4s 1511 eor w19,w19,w6 1512 eor v11.16b,v11.16b,v8.16b 1513 eor w20,w20,w7 1514 eor v15.16b,v15.16b,v12.16b 1515 eor w21,w21,w8 1516 eor v19.16b,v19.16b,v16.16b 1517 ror w17,w17,#16 1518 eor v23.16b,v23.16b,v20.16b 1519 ror w19,w19,#16 1520 eor v27.16b,v27.16b,v24.16b 1521 ror w20,w20,#16 1522 eor v31.16b,v31.16b,v28.16b 1523 ror w21,w21,#16 1524 rev32 v11.8h,v11.8h 1525 add w13,w13,w17 1526 rev32 v15.8h,v15.8h 1527 add w14,w14,w19 1528 rev32 v19.8h,v19.8h 1529 add w15,w15,w20 1530 rev32 v23.8h,v23.8h 1531 add w16,w16,w21 1532 rev32 v27.8h,v27.8h 1533 eor w9,w9,w13 1534 rev32 v31.8h,v31.8h 1535 eor w10,w10,w14 1536 add v10.4s,v10.4s,v11.4s 1537 eor w11,w11,w15 1538 add v14.4s,v14.4s,v15.4s 1539 eor w12,w12,w16 1540 add v18.4s,v18.4s,v19.4s 1541 ror w9,w9,#20 1542 add v22.4s,v22.4s,v23.4s 1543 ror w10,w10,#20 1544 add v26.4s,v26.4s,v27.4s 1545 ror w11,w11,#20 1546 add v30.4s,v30.4s,v31.4s 1547 ror w12,w12,#20 1548 eor v0.16b,v9.16b,v10.16b 1549 add w5,w5,w9 1550 eor v1.16b,v13.16b,v14.16b 1551 add w6,w6,w10 1552 eor v2.16b,v17.16b,v18.16b 1553 add w7,w7,w11 1554 eor v3.16b,v21.16b,v22.16b 1555 add w8,w8,w12 1556 eor v4.16b,v25.16b,v26.16b 1557 eor w17,w17,w5 1558 eor v5.16b,v29.16b,v30.16b 1559 eor w19,w19,w6 1560 ushr v9.4s,v0.4s,#20 1561 eor w20,w20,w7 1562 ushr v13.4s,v1.4s,#20 1563 eor w21,w21,w8 1564 ushr v17.4s,v2.4s,#20 1565 ror w17,w17,#24 1566 ushr v21.4s,v3.4s,#20 1567 ror w19,w19,#24 1568 ushr v25.4s,v4.4s,#20 1569 ror w20,w20,#24 1570 ushr v29.4s,v5.4s,#20 1571 ror w21,w21,#24 1572 sli v9.4s,v0.4s,#12 1573 add w13,w13,w17 1574 sli v13.4s,v1.4s,#12 1575 add 
w14,w14,w19 1576 sli v17.4s,v2.4s,#12 1577 add w15,w15,w20 1578 sli v21.4s,v3.4s,#12 1579 add w16,w16,w21 1580 sli v25.4s,v4.4s,#12 1581 eor w9,w9,w13 1582 sli v29.4s,v5.4s,#12 1583 eor w10,w10,w14 1584 add v8.4s,v8.4s,v9.4s 1585 eor w11,w11,w15 1586 add v12.4s,v12.4s,v13.4s 1587 eor w12,w12,w16 1588 add v16.4s,v16.4s,v17.4s 1589 ror w9,w9,#25 1590 add v20.4s,v20.4s,v21.4s 1591 ror w10,w10,#25 1592 add v24.4s,v24.4s,v25.4s 1593 ror w11,w11,#25 1594 add v28.4s,v28.4s,v29.4s 1595 ror w12,w12,#25 1596 eor v11.16b,v11.16b,v8.16b 1597 add w5,w5,w10 1598 eor v15.16b,v15.16b,v12.16b 1599 add w6,w6,w11 1600 eor v19.16b,v19.16b,v16.16b 1601 add w7,w7,w12 1602 eor v23.16b,v23.16b,v20.16b 1603 add w8,w8,w9 1604 eor v27.16b,v27.16b,v24.16b 1605 eor w21,w21,w5 1606 eor v31.16b,v31.16b,v28.16b 1607 eor w17,w17,w6 1608 tbl v11.16b,{v11.16b},v6.16b 1609 eor w19,w19,w7 1610 tbl v15.16b,{v15.16b},v6.16b 1611 eor w20,w20,w8 1612 tbl v19.16b,{v19.16b},v6.16b 1613 ror w21,w21,#16 1614 tbl v23.16b,{v23.16b},v6.16b 1615 ror w17,w17,#16 1616 tbl v27.16b,{v27.16b},v6.16b 1617 ror w19,w19,#16 1618 tbl v31.16b,{v31.16b},v6.16b 1619 ror w20,w20,#16 1620 add v10.4s,v10.4s,v11.4s 1621 add w15,w15,w21 1622 add v14.4s,v14.4s,v15.4s 1623 add w16,w16,w17 1624 add v18.4s,v18.4s,v19.4s 1625 add w13,w13,w19 1626 add v22.4s,v22.4s,v23.4s 1627 add w14,w14,w20 1628 add v26.4s,v26.4s,v27.4s 1629 eor w10,w10,w15 1630 add v30.4s,v30.4s,v31.4s 1631 eor w11,w11,w16 1632 eor v0.16b,v9.16b,v10.16b 1633 eor w12,w12,w13 1634 eor v1.16b,v13.16b,v14.16b 1635 eor w9,w9,w14 1636 eor v2.16b,v17.16b,v18.16b 1637 ror w10,w10,#20 1638 eor v3.16b,v21.16b,v22.16b 1639 ror w11,w11,#20 1640 eor v4.16b,v25.16b,v26.16b 1641 ror w12,w12,#20 1642 eor v5.16b,v29.16b,v30.16b 1643 ror w9,w9,#20 1644 ushr v9.4s,v0.4s,#25 1645 add w5,w5,w10 1646 ushr v13.4s,v1.4s,#25 1647 add w6,w6,w11 1648 ushr v17.4s,v2.4s,#25 1649 add w7,w7,w12 1650 ushr v21.4s,v3.4s,#25 1651 add w8,w8,w9 1652 ushr v25.4s,v4.4s,#25 1653 eor w21,w21,w5 1654 ushr 
v29.4s,v5.4s,#25 1655 eor w17,w17,w6 1656 sli v9.4s,v0.4s,#7 1657 eor w19,w19,w7 1658 sli v13.4s,v1.4s,#7 1659 eor w20,w20,w8 1660 sli v17.4s,v2.4s,#7 1661 ror w21,w21,#24 1662 sli v21.4s,v3.4s,#7 1663 ror w17,w17,#24 1664 sli v25.4s,v4.4s,#7 1665 ror w19,w19,#24 1666 sli v29.4s,v5.4s,#7 1667 ror w20,w20,#24 1668 ext v10.16b,v10.16b,v10.16b,#8 1669 add w15,w15,w21 1670 ext v14.16b,v14.16b,v14.16b,#8 1671 add w16,w16,w17 1672 ext v18.16b,v18.16b,v18.16b,#8 1673 add w13,w13,w19 1674 ext v22.16b,v22.16b,v22.16b,#8 1675 add w14,w14,w20 1676 ext v26.16b,v26.16b,v26.16b,#8 1677 eor w10,w10,w15 1678 ext v30.16b,v30.16b,v30.16b,#8 1679 eor w11,w11,w16 1680 ext v11.16b,v11.16b,v11.16b,#12 1681 eor w12,w12,w13 1682 ext v15.16b,v15.16b,v15.16b,#12 1683 eor w9,w9,w14 1684 ext v19.16b,v19.16b,v19.16b,#12 1685 ror w10,w10,#25 1686 ext v23.16b,v23.16b,v23.16b,#12 1687 ror w11,w11,#25 1688 ext v27.16b,v27.16b,v27.16b,#12 1689 ror w12,w12,#25 1690 ext v31.16b,v31.16b,v31.16b,#12 1691 ror w9,w9,#25 1692 ext v9.16b,v9.16b,v9.16b,#4 1693 ext v13.16b,v13.16b,v13.16b,#4 1694 ext v17.16b,v17.16b,v17.16b,#4 1695 ext v21.16b,v21.16b,v21.16b,#4 1696 ext v25.16b,v25.16b,v25.16b,#4 1697 ext v29.16b,v29.16b,v29.16b,#4 1698 add v8.4s,v8.4s,v9.4s 1699 add w5,w5,w9 1700 add v12.4s,v12.4s,v13.4s 1701 add w6,w6,w10 1702 add v16.4s,v16.4s,v17.4s 1703 add w7,w7,w11 1704 add v20.4s,v20.4s,v21.4s 1705 add w8,w8,w12 1706 add v24.4s,v24.4s,v25.4s 1707 eor w17,w17,w5 1708 add v28.4s,v28.4s,v29.4s 1709 eor w19,w19,w6 1710 eor v11.16b,v11.16b,v8.16b 1711 eor w20,w20,w7 1712 eor v15.16b,v15.16b,v12.16b 1713 eor w21,w21,w8 1714 eor v19.16b,v19.16b,v16.16b 1715 ror w17,w17,#16 1716 eor v23.16b,v23.16b,v20.16b 1717 ror w19,w19,#16 1718 eor v27.16b,v27.16b,v24.16b 1719 ror w20,w20,#16 1720 eor v31.16b,v31.16b,v28.16b 1721 ror w21,w21,#16 1722 rev32 v11.8h,v11.8h 1723 add w13,w13,w17 1724 rev32 v15.8h,v15.8h 1725 add w14,w14,w19 1726 rev32 v19.8h,v19.8h 1727 add w15,w15,w20 1728 rev32 v23.8h,v23.8h 1729 add 
w16,w16,w21 1730 rev32 v27.8h,v27.8h 1731 eor w9,w9,w13 1732 rev32 v31.8h,v31.8h 1733 eor w10,w10,w14 1734 add v10.4s,v10.4s,v11.4s 1735 eor w11,w11,w15 1736 add v14.4s,v14.4s,v15.4s 1737 eor w12,w12,w16 1738 add v18.4s,v18.4s,v19.4s 1739 ror w9,w9,#20 1740 add v22.4s,v22.4s,v23.4s 1741 ror w10,w10,#20 1742 add v26.4s,v26.4s,v27.4s 1743 ror w11,w11,#20 1744 add v30.4s,v30.4s,v31.4s 1745 ror w12,w12,#20 1746 eor v0.16b,v9.16b,v10.16b 1747 add w5,w5,w9 1748 eor v1.16b,v13.16b,v14.16b 1749 add w6,w6,w10 1750 eor v2.16b,v17.16b,v18.16b 1751 add w7,w7,w11 1752 eor v3.16b,v21.16b,v22.16b 1753 add w8,w8,w12 1754 eor v4.16b,v25.16b,v26.16b 1755 eor w17,w17,w5 1756 eor v5.16b,v29.16b,v30.16b 1757 eor w19,w19,w6 1758 ushr v9.4s,v0.4s,#20 1759 eor w20,w20,w7 1760 ushr v13.4s,v1.4s,#20 1761 eor w21,w21,w8 1762 ushr v17.4s,v2.4s,#20 1763 ror w17,w17,#24 1764 ushr v21.4s,v3.4s,#20 1765 ror w19,w19,#24 1766 ushr v25.4s,v4.4s,#20 1767 ror w20,w20,#24 1768 ushr v29.4s,v5.4s,#20 1769 ror w21,w21,#24 1770 sli v9.4s,v0.4s,#12 1771 add w13,w13,w17 1772 sli v13.4s,v1.4s,#12 1773 add w14,w14,w19 1774 sli v17.4s,v2.4s,#12 1775 add w15,w15,w20 1776 sli v21.4s,v3.4s,#12 1777 add w16,w16,w21 1778 sli v25.4s,v4.4s,#12 1779 eor w9,w9,w13 1780 sli v29.4s,v5.4s,#12 1781 eor w10,w10,w14 1782 add v8.4s,v8.4s,v9.4s 1783 eor w11,w11,w15 1784 add v12.4s,v12.4s,v13.4s 1785 eor w12,w12,w16 1786 add v16.4s,v16.4s,v17.4s 1787 ror w9,w9,#25 1788 add v20.4s,v20.4s,v21.4s 1789 ror w10,w10,#25 1790 add v24.4s,v24.4s,v25.4s 1791 ror w11,w11,#25 1792 add v28.4s,v28.4s,v29.4s 1793 ror w12,w12,#25 1794 eor v11.16b,v11.16b,v8.16b 1795 add w5,w5,w10 1796 eor v15.16b,v15.16b,v12.16b 1797 add w6,w6,w11 1798 eor v19.16b,v19.16b,v16.16b 1799 add w7,w7,w12 1800 eor v23.16b,v23.16b,v20.16b 1801 add w8,w8,w9 1802 eor v27.16b,v27.16b,v24.16b 1803 eor w21,w21,w5 1804 eor v31.16b,v31.16b,v28.16b 1805 eor w17,w17,w6 1806 tbl v11.16b,{v11.16b},v6.16b 1807 eor w19,w19,w7 1808 tbl v15.16b,{v15.16b},v6.16b 1809 eor w20,w20,w8 
1810 tbl v19.16b,{v19.16b},v6.16b 1811 ror w21,w21,#16 1812 tbl v23.16b,{v23.16b},v6.16b 1813 ror w17,w17,#16 1814 tbl v27.16b,{v27.16b},v6.16b 1815 ror w19,w19,#16 1816 tbl v31.16b,{v31.16b},v6.16b 1817 ror w20,w20,#16 1818 add v10.4s,v10.4s,v11.4s 1819 add w15,w15,w21 1820 add v14.4s,v14.4s,v15.4s 1821 add w16,w16,w17 1822 add v18.4s,v18.4s,v19.4s 1823 add w13,w13,w19 1824 add v22.4s,v22.4s,v23.4s 1825 add w14,w14,w20 1826 add v26.4s,v26.4s,v27.4s 1827 eor w10,w10,w15 1828 add v30.4s,v30.4s,v31.4s 1829 eor w11,w11,w16 1830 eor v0.16b,v9.16b,v10.16b 1831 eor w12,w12,w13 1832 eor v1.16b,v13.16b,v14.16b 1833 eor w9,w9,w14 1834 eor v2.16b,v17.16b,v18.16b 1835 ror w10,w10,#20 1836 eor v3.16b,v21.16b,v22.16b 1837 ror w11,w11,#20 1838 eor v4.16b,v25.16b,v26.16b 1839 ror w12,w12,#20 1840 eor v5.16b,v29.16b,v30.16b 1841 ror w9,w9,#20 1842 ushr v9.4s,v0.4s,#25 1843 add w5,w5,w10 1844 ushr v13.4s,v1.4s,#25 1845 add w6,w6,w11 1846 ushr v17.4s,v2.4s,#25 1847 add w7,w7,w12 1848 ushr v21.4s,v3.4s,#25 1849 add w8,w8,w9 1850 ushr v25.4s,v4.4s,#25 1851 eor w21,w21,w5 1852 ushr v29.4s,v5.4s,#25 1853 eor w17,w17,w6 1854 sli v9.4s,v0.4s,#7 1855 eor w19,w19,w7 1856 sli v13.4s,v1.4s,#7 1857 eor w20,w20,w8 1858 sli v17.4s,v2.4s,#7 1859 ror w21,w21,#24 1860 sli v21.4s,v3.4s,#7 1861 ror w17,w17,#24 1862 sli v25.4s,v4.4s,#7 1863 ror w19,w19,#24 1864 sli v29.4s,v5.4s,#7 1865 ror w20,w20,#24 1866 ext v10.16b,v10.16b,v10.16b,#8 1867 add w15,w15,w21 1868 ext v14.16b,v14.16b,v14.16b,#8 1869 add w16,w16,w17 1870 ext v18.16b,v18.16b,v18.16b,#8 1871 add w13,w13,w19 1872 ext v22.16b,v22.16b,v22.16b,#8 1873 add w14,w14,w20 1874 ext v26.16b,v26.16b,v26.16b,#8 1875 eor w10,w10,w15 1876 ext v30.16b,v30.16b,v30.16b,#8 1877 eor w11,w11,w16 1878 ext v11.16b,v11.16b,v11.16b,#4 1879 eor w12,w12,w13 1880 ext v15.16b,v15.16b,v15.16b,#4 1881 eor w9,w9,w14 1882 ext v19.16b,v19.16b,v19.16b,#4 1883 ror w10,w10,#25 1884 ext v23.16b,v23.16b,v23.16b,#4 1885 ror w11,w11,#25 1886 ext v27.16b,v27.16b,v27.16b,#4 1887 
ror w12,w12,#25 1888 ext v31.16b,v31.16b,v31.16b,#4 1889 ror w9,w9,#25 1890 ext v9.16b,v9.16b,v9.16b,#12 1891 ext v13.16b,v13.16b,v13.16b,#12 1892 ext v17.16b,v17.16b,v17.16b,#12 1893 ext v21.16b,v21.16b,v21.16b,#12 1894 ext v25.16b,v25.16b,v25.16b,#12 1895 ext v29.16b,v29.16b,v29.16b,#12 1896 cbnz x4,.Loop_lower_neon 1897 1898 add w5,w5,w22 // accumulate key block 1899 ldp q0,q1,[sp,#0] 1900 add x6,x6,x22,lsr#32 1901 ldp q2,q3,[sp,#32] 1902 add w7,w7,w23 1903 ldp q4,q5,[sp,#64] 1904 add x8,x8,x23,lsr#32 1905 ldr q6,[sp,#96] 1906 add v8.4s,v8.4s,v0.4s 1907 add w9,w9,w24 1908 add v12.4s,v12.4s,v0.4s 1909 add x10,x10,x24,lsr#32 1910 add v16.4s,v16.4s,v0.4s 1911 add w11,w11,w25 1912 add v20.4s,v20.4s,v0.4s 1913 add x12,x12,x25,lsr#32 1914 add v24.4s,v24.4s,v0.4s 1915 add w13,w13,w26 1916 add v28.4s,v28.4s,v0.4s 1917 add x14,x14,x26,lsr#32 1918 add v10.4s,v10.4s,v2.4s 1919 add w15,w15,w27 1920 add v14.4s,v14.4s,v2.4s 1921 add x16,x16,x27,lsr#32 1922 add v18.4s,v18.4s,v2.4s 1923 add w17,w17,w28 1924 add v22.4s,v22.4s,v2.4s 1925 add x19,x19,x28,lsr#32 1926 add v26.4s,v26.4s,v2.4s 1927 add w20,w20,w30 1928 add v30.4s,v30.4s,v2.4s 1929 add x21,x21,x30,lsr#32 1930 add v27.4s,v27.4s,v7.4s // +4 1931 add x5,x5,x6,lsl#32 // pack 1932 add v31.4s,v31.4s,v7.4s // +4 1933 add x7,x7,x8,lsl#32 1934 add v11.4s,v11.4s,v3.4s 1935 ldp x6,x8,[x1,#0] // load input 1936 add v15.4s,v15.4s,v4.4s 1937 add x9,x9,x10,lsl#32 1938 add v19.4s,v19.4s,v5.4s 1939 add x11,x11,x12,lsl#32 1940 add v23.4s,v23.4s,v6.4s 1941 ldp x10,x12,[x1,#16] 1942 add v27.4s,v27.4s,v3.4s 1943 add x13,x13,x14,lsl#32 1944 add v31.4s,v31.4s,v4.4s 1945 add x15,x15,x16,lsl#32 1946 add v9.4s,v9.4s,v1.4s 1947 ldp x14,x16,[x1,#32] 1948 add v13.4s,v13.4s,v1.4s 1949 add x17,x17,x19,lsl#32 1950 add v17.4s,v17.4s,v1.4s 1951 add x20,x20,x21,lsl#32 1952 add v21.4s,v21.4s,v1.4s 1953 ldp x19,x21,[x1,#48] 1954 add v25.4s,v25.4s,v1.4s 1955 add x1,x1,#64 1956 add v29.4s,v29.4s,v1.4s 1957 1958 #ifdef __AARCH64EB__ 1959 rev x5,x5 1960 rev 
x7,x7 1961 rev x9,x9 1962 rev x11,x11 1963 rev x13,x13 1964 rev x15,x15 1965 rev x17,x17 1966 rev x20,x20 1967 #endif 1968 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1969 eor x5,x5,x6 1970 eor x7,x7,x8 1971 eor x9,x9,x10 1972 eor x11,x11,x12 1973 eor x13,x13,x14 1974 eor v8.16b,v8.16b,v0.16b 1975 eor x15,x15,x16 1976 eor v9.16b,v9.16b,v1.16b 1977 eor x17,x17,x19 1978 eor v10.16b,v10.16b,v2.16b 1979 eor x20,x20,x21 1980 eor v11.16b,v11.16b,v3.16b 1981 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1982 1983 stp x5,x7,[x0,#0] // store output 1984 add x28,x28,#7 // increment counter 1985 stp x9,x11,[x0,#16] 1986 stp x13,x15,[x0,#32] 1987 stp x17,x20,[x0,#48] 1988 add x0,x0,#64 1989 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1990 1991 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1992 eor v12.16b,v12.16b,v0.16b 1993 eor v13.16b,v13.16b,v1.16b 1994 eor v14.16b,v14.16b,v2.16b 1995 eor v15.16b,v15.16b,v3.16b 1996 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1997 1998 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1999 eor v16.16b,v16.16b,v8.16b 2000 ldp q0,q1,[sp,#0] 2001 eor v17.16b,v17.16b,v9.16b 2002 ldp q2,q3,[sp,#32] 2003 eor v18.16b,v18.16b,v10.16b 2004 eor v19.16b,v19.16b,v11.16b 2005 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 2006 2007 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 2008 eor v20.16b,v20.16b,v12.16b 2009 eor v21.16b,v21.16b,v13.16b 2010 eor v22.16b,v22.16b,v14.16b 2011 eor v23.16b,v23.16b,v15.16b 2012 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 2013 2014 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 2015 eor v24.16b,v24.16b,v16.16b 2016 eor v25.16b,v25.16b,v17.16b 2017 eor v26.16b,v26.16b,v18.16b 2018 eor v27.16b,v27.16b,v19.16b 2019 st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 2020 2021 shl v8.4s,v7.4s,#1 // 4 -> 8 2022 eor v28.16b,v28.16b,v20.16b 2023 eor v29.16b,v29.16b,v21.16b 2024 eor v30.16b,v30.16b,v22.16b 2025 eor v31.16b,v31.16b,v23.16b 2026 st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 2027 2028 add v3.4s,v3.4s,v8.4s // += 8 2029 
add v4.4s,v4.4s,v8.4s 2030 add v5.4s,v5.4s,v8.4s 2031 add v6.4s,v6.4s,v8.4s 2032 2033 b.hs .Loop_outer_512_neon 2034 2035 adds x2,x2,#512 2036 ushr v7.4s,v7.4s,#1 // 4 -> 2 2037 2038 ldp d10,d11,[sp,#128+16] // meet ABI requirements 2039 ldp d12,d13,[sp,#128+32] 2040 ldp d14,d15,[sp,#128+48] 2041 2042 stp q0,q0,[sp,#0] // wipe off-load area 2043 stp q0,q0,[sp,#32] 2044 stp q0,q0,[sp,#64] 2045 2046 b.eq .Ldone_512_neon 2047 2048 sub x3,x3,#16 // .Lone 2049 cmp x2,#192 2050 add sp,sp,#128 2051 sub v3.4s,v3.4s,v7.4s // -= 2 2052 ld1 {v8.4s,v9.4s},[x3] 2053 b.hs .Loop_outer_neon 2054 2055 ldp d8,d9,[sp,#0] // meet ABI requirements 2056 eor v1.16b,v1.16b,v1.16b 2057 eor v2.16b,v2.16b,v2.16b 2058 eor v3.16b,v3.16b,v3.16b 2059 eor v4.16b,v4.16b,v4.16b 2060 eor v5.16b,v5.16b,v5.16b 2061 eor v6.16b,v6.16b,v6.16b 2062 b .Loop_outer 2063 2064 .Ldone_512_neon: 2065 ldp d8,d9,[sp,#128+0] // meet ABI requirements 2066 ldp x19,x20,[x29,#16] 2067 add sp,sp,#128+64 2068 ldp x21,x22,[x29,#32] 2069 ldp x23,x24,[x29,#48] 2070 ldp x25,x26,[x29,#64] 2071 ldp x27,x28,[x29,#80] 2072 ldp x29,x30,[sp],#96 2073 AARCH64_VALIDATE_LINK_REGISTER 2074 ret 2075 .size ChaCha20_512_neon,.-ChaCha20_512_neon 2076