// AES for ARMv8 Crypto Extensions (AESE/AESD/AESMC/AESIMC), GNU as syntax.
// Key schedule layout follows the OpenSSL AES_KEY convention: 16-byte round
// keys back to back, with the round count stored as a word at byte offset 240
// (see the `ldr wN,[xN,#240]` loads and the final `str w12,[x2]` below).
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text
.section .rodata
.align	5
// Round-constant table used by the key-expansion loops:
//   [0] rcon seed 0x01 splatted across 4 lanes,
//   [1] byte-permutation mask for TBL (rotate-n-splat of the last word),
//   [2] rcon 0x1b for the tail of 128-bit expansion.
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
.previous

//----------------------------------------------------------------------
// int aes_v8_set_encrypt_key(const u8 *userKey, int bits, AES_KEY *key)
// In:  x0 = userKey, w1 = bits, x2 = key schedule out
// Out: x0 = 0 on success, -1 if a pointer is NULL, -2 if bits is not
//      128/192/256 (checked as range [128,256] plus 64-bit multiple).
// Uses only volatile registers; frame is pushed but x30 is never live
// across a call here (see the PAuth note below).
//----------------------------------------------------------------------
.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// pending return code: NULL argument
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2			// pending return code: bad key length
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f		// must be a multiple of 64 bits
	b.ne	.Lenc_key_abort

	adrp	x3,.Lrcon		// x3 -> rcon/mask table
	add	x3,x3,#:lo12:.Lrcon
	cmp	w1,#192			// dispatch on key size below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0, used as zero operand for AESE
	ld1	{v3.16b},[x0],#16	// first 128 bits of user key
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon, v2 = TBL permutation mask

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

// 128-bit expansion: 8 full iterations, then two unrolled tail rounds
// switching rcon to 0x1b.  Each iteration derives the next round key in v3
// from the previous one via TBL (rotate last word), AESE with a zero key
// (SubBytes only), and the usual shift-and-xor cascade through v5.
.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b		// key=0 => pure SubBytes/ShiftRows
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// next rcon = rcon*2 (per byte lane)
	eor	v3.16b,v3.16b,v6.16b
	b.ne	.Loop128

	ld1	{v1.4s},[x3]		// switch to rcon 0x1b for the last rounds

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]
	add	x2,x2,#0x50		// x2 -> rounds field (0x50 past last store)

	mov	w12,#10			// AES-128: 10 rounds
	b	.Ldone

// 192-bit expansion: the key's extra 64 bits live in v4; the TBL mask is
// shifted down by 8 so the rotate picks up the right word.
.align	4
.L192:
	ld1	{v4.8b},[x0],#8		// remaining 64 bits of the user key
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
#ifdef __AARCH64EB__
	st1	{v4.4s},[x2],#16	// big-endian: store 16, rewind 8
	sub	x2,x2,#8
#else
	st1	{v4.8b},[x2],#8		// little-endian: store the 64-bit half
#endif
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]		// splat last word of v3
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12			// AES-192: 12 rounds
	add	x2,x2,#0x20		// x2 -> rounds field
	b	.Ldone

// 256-bit expansion: alternate deriving v3 (with rcon) and v4 (plain
// splat+SubBytes, no rcon), 7 iterations total.
.align	4
.L256:
	ld1	{v4.16b},[x0]		// second 128 bits of the user key
	mov	w1,#7
	mov	w12,#14			// AES-256: 14 rounds
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b		// SubBytes only, no rcon on this half

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// store round count at offset 240
	mov	x3,#0			// success

.Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16
	ret
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

//----------------------------------------------------------------------
// int aes_v8_set_decrypt_key(const u8 *userKey, int bits, AES_KEY *key)
// Builds the encrypt schedule via .Lenc_key, then converts it in place for
// the equivalent-inverse-cipher: round-key order is reversed (swapping from
// both ends, x2 walking up and x0 walking down) and every inner round key
// is passed through AESIMC (InvMixColumns).  Returns 0 on success or the
// error code propagated from .Lenc_key.
// x30 is live across the `bl`, hence the SIGN/VALIDATE_LINK_REGISTER pair.
//----------------------------------------------------------------------
.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort		// propagate encrypt-key error

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// downward stride for the top pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap first and last round keys (no AESIMC on the outermost pair).
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Swap + InvMixColumns the inner round keys until pointers meet.
.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// Middle round key (odd one out) gets AESIMC in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key

//----------------------------------------------------------------------
// void aes_v8_encrypt(const u8 *in, u8 *out, const AES_KEY *key)
// Single-block encryption.  x0 = in, x1 = out, x2 = key schedule
// (rounds at [x2,#240]).  Two rounds per loop iteration, with the next
// two round keys prefetched inside the loop; the last two rounds plus
// the final AddRoundKey are peeled off after the loop.
//----------------------------------------------------------------------
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = rounds
	ld1	{v0.4s},[x2],#16	// rndkey[0]
	ld1	{v2.16b},[x0]		// v2 = plaintext block
	sub	w3,w3,#2		// loop counts rounds-2 (last 2 peeled)
	ld1	{v1.4s},[x2],#16	// rndkey[1]

.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// last round key
	aese	v2.16b,v1.16b		// final round: no MixColumns
	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_encrypt,.-aes_v8_encrypt

//----------------------------------------------------------------------
// void aes_v8_decrypt(const u8 *in, u8 *out, const AES_KEY *key)
// Single-block decryption; exact mirror of aes_v8_encrypt using
// AESD/AESIMC against a schedule prepared by aes_v8_set_decrypt_key.
//----------------------------------------------------------------------
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = rounds
	ld1	{v0.4s},[x2],#16
	ld1	{v2.16b},[x0]		// v2 = ciphertext block
	sub	w3,w3,#2
	ld1	{v1.4s},[x2],#16

.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]
	aesd	v2.16b,v1.16b		// final round: no InvMixColumns
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_decrypt,.-aes_v8_decrypt

//----------------------------------------------------------------------
// void aes_v8_ecb_encrypt(const u8 *in, u8 *out, size_t len,
//                         const AES_KEY *key, int enc)
// x0 = in, x1 = out, x2 = byte length, x3 = key schedule, w4 = enc flag
// (non-zero => encrypt, checked via `cmp w4,#0`).
// Fast path: exactly one block (len==16) is handled without touching the
// stack.  Larger inputs go through .Lecb_big_size, which processes blocks
// 5-at-a-time, then 3-at-a-time, then a 1-2 block tail; x0 is deliberately
// rewound near the end of data so the last loads pick up the final blocks
// (see the `add x0,x0,x6` comments).
//----------------------------------------------------------------------
.globl	aes_v8_ecb_encrypt
.type	aes_v8_ecb_encrypt,%function
.align	5
aes_v8_ecb_encrypt:
	AARCH64_VALID_CALL_TARGET
	subs	x2,x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	ld1	{v0.16b},[x0]
	cmp	w4,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	ld1	{v5.4s,v6.4s},[x3],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	v0.16b,v5.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v6.16b
	aesmc	v0.16b,v0.16b
	subs	w5,w5,#10		// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	// Extra rounds for 192/256-bit keys, two per iteration.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16	// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16	// load key schedule...
	subs	w5,w5,#2		// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	// Last 8 rounds fully unrolled; v7 ends up as the final round key.
	ld1	{v18.4s,v19.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b	// final AddRoundKey
	st1	{v0.16b},[x1]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	// Single-block decrypt mirror of the path above.
	aesd	v0.16b,v5.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v6.16b
	aesimc	v0.16b,v0.16b
	subs	w5,w5,#10		// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16	// load key schedule...
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16	// load key schedule...
	subs	w5,w5,#2		// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	ld1	{v18.4s,v19.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	st1	{v0.16b},[x1]
	b	.Lecb_Final_abort
.Lecb_big_size:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x8,#16			// x8 = post-increment; zeroed at last block
	b.lo	.Lecb_done		// len < 16: nothing to do
	csel	x8,xzr,x8,eq

	cmp	w4,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16		// round length down to whole blocks
	ld1	{v0.16b},[x0],x8

	// Keep rndkey[0..1] in v16/v17 and the last 7 round keys resident in
	// v18-v23 + v7; only the middle keys are re-streamed per iteration.
	ld1	{v16.4s,v17.4s},[x3]	// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32		// x7 -> rndkey[2]
	mov	w6,w5
	b.eq	.Lecb_dec

	ld1	{v1.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v1.16b,v1.16b	// orr x,y,y is the MOV idiom here
	orr	v24.16b,v1.16b,v1.16b
	orr	v1.16b,v0.16b,v0.16b
	b.lo	.Lecb_enc_tail		// fewer than 2 blocks remain

	orr	v1.16b,v3.16b,v3.16b
	ld1	{v24.16b},[x0],#16
	cmp	x2,#32
	b.lo	.Loop3x_ecb_enc

	ld1	{v25.16b},[x0],#16
	ld1	{v26.16b},[x0],#16
	sub	x2,x2,#32		// bias
	mov	w6,w5

	// Main 5-block-wide encrypt pipeline: v0,v1,v24,v25,v26 in flight.
.Loop5x_ecb_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_ecb_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	cmp	x2,#0x40		// because .Lecb_enc_tail4x
	sub	x2,x2,#0x50

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lecb_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	// Last round interleaved with loading the next 5 input blocks.
	aese	v0.16b,v23.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b
	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lecb_enc_tail4x
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor	v4.16b,v7.16b,v0.16b
	orr	v0.16b,v2.16b,v2.16b
	eor	v5.16b,v7.16b,v1.16b
	orr	v1.16b,v3.16b,v3.16b
	eor	v17.16b,v7.16b,v24.16b
	orr	v24.16b,v27.16b,v27.16b
	eor	v30.16b,v7.16b,v25.16b
	orr	v25.16b,v28.16b,v28.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v4.16b},[x1],#16
	orr	v26.16b,v29.16b,v29.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_ecb_enc

	add	x2,x2,#0x50		// undo the 5-block bias
	cbz	x2,.Lecb_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	orr	v0.16b,v27.16b,v27.16b
	orr	v1.16b,v28.16b,v28.16b
	orr	v24.16b,v29.16b,v29.16b
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	// Exactly 4 blocks remained; v0's result is discarded by design
	// (x0 was rewound so v1/v24/v25/v26 hold the last 4 blocks).
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	eor	v30.16b,v7.16b,v25.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16

	b	.Lecb_done
.align	4
.Loop3x_ecb_enc:
	// 3-block-wide pipeline for medium tails.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ecb_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	subs	x2,x2,#0x30
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v24.16b
					// are loaded with last "words"
	mov	x7,x3
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	ld1	{v2.16b},[x0],#16
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	ld1	{v3.16b},[x0],#16
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v7.16b,v0.16b
	eor	v5.16b,v7.16b,v1.16b
	eor	v24.16b,v24.16b,v7.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	.Loop3x_ecb_enc

	cmn	x2,#0x30		// x2 == -0x30 means nothing left
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	// 1 or 2 blocks remain, held in v1 (maybe unused) and v24.
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lecb_enc_tail

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20		// x2 == -0x20 => exactly one block
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	.Lecb_enc_one
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lecb_done

.Lecb_enc_one:
	eor	v5.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	b	.Lecb_done
.align	5
.Lecb_dec:
	// Multi-block decrypt: same structure as the encrypt side, with
	// AESD/AESIMC against the inverse-cipher schedule.
	ld1	{v1.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v1.16b,v1.16b
	orr	v24.16b,v1.16b,v1.16b
	orr	v1.16b,v0.16b,v0.16b
	b.lo	.Lecb_dec_tail

	orr	v1.16b,v3.16b,v3.16b
	ld1	{v24.16b},[x0],#16
	cmp	x2,#32
	b.lo	.Loop3x_ecb_dec

	ld1	{v25.16b},[x0],#16
	ld1	{v26.16b},[x0],#16
	sub	x2,x2,#32		// bias
	mov	w6,w5

.Loop5x_ecb_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_ecb_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	cmp	x2,#0x40		// because .Lecb_tail4x
	sub	x2,x2,#0x50

	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v18.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v18.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v18.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v18.16b
	aesimc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lecb_tail4x

	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v19.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v19.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v19.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v19.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v20.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v20.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v21.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v21.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v22.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v22.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v23.16b
	ld1	{v2.16b},[x0],#16
	aesd	v1.16b,v23.16b
	ld1	{v3.16b},[x0],#16
	aesd	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	aesd	v25.16b,v23.16b
	ld1	{v28.16b},[x0],#16
	aesd	v26.16b,v23.16b
	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lecb_tail4x
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor	v4.16b,v7.16b,v0.16b
	orr	v0.16b,v2.16b,v2.16b
	eor	v5.16b,v7.16b,v1.16b
	orr	v1.16b,v3.16b,v3.16b
	eor	v17.16b,v7.16b,v24.16b
	orr	v24.16b,v27.16b,v27.16b
	eor	v30.16b,v7.16b,v25.16b
	orr	v25.16b,v28.16b,v28.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v4.16b},[x1],#16
	orr	v26.16b,v29.16b,v29.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_ecb_dec

	add	x2,x2,#0x50		// undo the 5-block bias
	cbz	x2,.Lecb_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	orr	v0.16b,v27.16b,v27.16b
	orr	v1.16b,v28.16b,v28.16b
	orr	v24.16b,v29.16b,v29.16b
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	// Exactly 4 blocks remained; v0's result is discarded (x0 rewind).
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	eor	v30.16b,v7.16b,v25.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16

	b	.Lecb_done
.align	4
.Loop3x_ecb_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ecb_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	subs	x2,x2,#0x30
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v24.16b
					// are loaded with last "words"
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v7.16b,v0.16b
	eor	v5.16b,v7.16b,v1.16b
	eor	v24.16b,v24.16b,v7.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	.Loop3x_ecb_dec

	cmn	x2,#0x30		// x2 == -0x30 means nothing left
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lecb_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	cmn	x2,#0x20		// x2 == -0x20 => exactly one block
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	b.eq	.Lecb_dec_one
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lecb_done

.Lecb_dec_one:
	eor	v5.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16

.Lecb_done:
	ldr	x29,[sp],#16
.Lecb_Final_abort:
	ret
.size	aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt

//----------------------------------------------------------------------
// void aes_v8_cbc_encrypt(const u8 *in, u8 *out, size_t len,
//                         const AES_KEY *key, u8 *ivec, int enc)
// x0 = in, x1 = out, x2 = byte length, x3 = key schedule, x4 = IV buffer
// (read at entry, final IV written back at .Lcbc_done), w5 = enc flag.
// Encryption is inherently serial (one block at a time); the code hides
// latency by pre-xoring the next plaintext with rndkey[0]^rndkey[last]
// (v5) so the chaining xor folds into the round sequence.  Decryption
// reuses the 5x/3x interleaving scheme of the ECB path.
//----------------------------------------------------------------------
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
aes_v8_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16			// x8 = input post-increment; 0 on last block
	b.lo	.Lcbc_abort
	csel	x8,xzr,x8,eq

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16		// whole blocks only
	ld1	{v6.16b},[x4]		// v6 = IV
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]	// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]		// v7 = last round key

	add	x7,x3,#32
	mov	w6,w5
	b.eq	.Lcbc_dec

	cmp	w5,#2			// 2 => 10 rounds (AES-128)
	eor	v0.16b,v0.16b,v6.16b	// first block xor IV
	eor	v5.16b,v16.16b,v7.16b	// v5 = rndkey[0]^rndkey[last] pre-xor
	b.eq	.Lcbc_enc128

	// 192/256-bit encrypt: keep per-size key pointers in x6/x12/x14/x3.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	v0.16b,v16.16b		// v16 = next_pt^v5; cancels to CBC input^key0
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// emit previous ciphertext
.Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4			// 4 => 12 rounds (AES-192)
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	.Lcbc_enc192

	// AES-256 only: two extra rounds.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop

.Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// stop advancing input on last block
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// next plaintext
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b	// pre-xor with key0^keylast
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext (= next IV)
	b.hs	.Loop_cbc_enc

	st1	{v6.16b},[x1],#16	// flush final ciphertext
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	// AES-128 encrypt: entire schedule fits in registers, no reloads.
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done
.align	5
.Lcbc_dec:
	// CBC decrypt: v2/v3/v27/v28/v29 shadow the raw ciphertexts so each
	// can serve as the chaining value for the following block.
	ld1	{v24.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v27.16b,v24.16b,v24.16b
	b.lo	.Lcbc_dec_tail

	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v27.16b,v24.16b,v24.16b
	cmp	x2,#32
	b.lo	.Loop3x_cbc_dec

	ld1	{v25.16b},[x0],#16
	ld1	{v26.16b},[x0],#16
	sub	x2,x2,#32		// bias
	mov	w6,w5
	orr	v28.16b,v25.16b,v25.16b
	orr	v29.16b,v26.16b,v26.16b

.Loop5x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	cmp	x2,#0x40		// because .Lcbc_tail4x
	sub	x2,x2,#0x50

	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v18.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v18.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v18.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v18.16b
	aesimc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lcbc_tail4x

	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v19.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v19.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v19.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v19.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v20.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v20.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v21.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v21.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v22.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v22.16b
	aesimc	v26.16b,v26.16b

	// Fold last round key into the chaining values (IV/prev ciphertexts)
	// while streaming in the next 5 ciphertext blocks.
	eor	v4.16b,v6.16b,v7.16b
	aesd	v0.16b,v23.16b
	eor	v5.16b,v2.16b,v7.16b
	ld1	{v2.16b},[x0],#16
	aesd	v1.16b,v23.16b
	eor	v17.16b,v3.16b,v7.16b
	ld1	{v3.16b},[x0],#16
	aesd	v24.16b,v23.16b
	eor	v30.16b,v27.16b,v7.16b
	ld1	{v27.16b},[x0],#16
	aesd	v25.16b,v23.16b
	eor	v31.16b,v28.16b,v7.16b
	ld1	{v28.16b},[x0],#16
	aesd	v26.16b,v23.16b
	orr	v6.16b,v29.16b,v29.16b	// last ciphertext becomes next IV
	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lcbc_tail4x
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	orr	v0.16b,v2.16b,v2.16b
	eor	v5.16b,v5.16b,v1.16b
	orr	v1.16b,v3.16b,v3.16b
	eor	v17.16b,v17.16b,v24.16b
	orr	v24.16b,v27.16b,v27.16b
	eor	v30.16b,v30.16b,v25.16b
	orr	v25.16b,v28.16b,v28.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	orr	v26.16b,v29.16b,v29.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_cbc_dec

	add	x2,x2,#0x50		// undo the 5-block bias
	cbz	x2,.Lcbc_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	orr	v0.16b,v27.16b,v27.16b
	orr	v2.16b,v27.16b,v27.16b
	orr	v1.16b,v28.16b,v28.16b
	orr	v3.16b,v28.16b,v28.16b
	orr	v24.16b,v29.16b,v29.16b
	orr	v27.16b,v29.16b,v29.16b
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	// Exactly 4 blocks remained; v0's result is discarded (x0 rewind).
	eor	v5.16b,v4.16b,v1.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16

	b	.Lcbc_done
.align	4
.Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b	// chaining values ^ last round key
	subs	x2,x2,#0x30
	eor	v5.16b,v2.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	eor	v17.16b,v3.16b,v7.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v24.16b
					// are loaded with last "words"
	orr	v6.16b,v27.16b,v27.16b	// last ciphertext becomes next IV
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	.Loop3x_cbc_dec

	cmn	x2,#0x30		// x2 == -0x30 means nothing left
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	cmn	x2,#0x20		// x2 == -0x20 => exactly one block
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v27.16b,v27.16b	// new IV = last ciphertext
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v27.16b,v27.16b
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]		// write back updated IV
.Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt

// CTR32 with 12-way unroll and EOR3 — body continues beyond this chunk.
.globl	aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.type	aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
.align	5
aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
// AES-CTR (32-bit counter) bulk encryption, 12-way interleaved variant.
// Uses the EOR3 three-way XOR (emitted as .inst words below, since the
// assembler may not know the mnemonic).
// Register roles as used below: x0 = input, x1 = output, x2 = number of
// 16-byte blocks, x3 = key schedule (round count at [x3,#240]),
// x4 = 16-byte counter block, whose last word [x4,#12] is the 32-bit
// big-endian block counter.
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-80]!
	stp	d8,d9,[sp, #16]			// v8-v15 low halves are callee-saved
	stp	d10,d11,[sp, #32]
	stp	d12,d13,[sp, #48]
	stp	d14,d15,[sp, #64]
	add	x29,sp,#0

	ldr	w5,[x3,#240]			// w5 = number of rounds

	ldr	w8, [x4, #12]			// w8 = 32-bit counter word
#ifdef __AARCH64EB__
	ld1	{v24.16b},[x4]
#else
	ld1	{v24.4s},[x4]
#endif
	ld1	{v2.4s,v3.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	cmp	x2,#2
	add	x7,x3,x5,lsl#4			// pointer to last round key
	sub	w5,w5,#2
	add	x7, x7, #64
	ld1	{v1.4s},[x7]			// v1 = last round key
	add	x7,x3,#32
	mov	w6,w5				// w6 = inner round-loop counter
#ifndef __AARCH64EB__
	rev	w8, w8				// counter is kept big-endian in memory
#endif

	// Materialize successive counter blocks in v25,v26 (v0 keeps a
	// pristine copy of the counter block for later lanes).
	orr	v25.16b,v24.16b,v24.16b
	add	w10, w8, #1
	orr	v26.16b,v24.16b,v24.16b
	add	w8, w8, #2
	orr	v0.16b,v24.16b,v24.16b
	rev	w10, w10
	mov	v25.s[3],w10
	b.ls	.Lctr32_tail_unroll		// <=2 blocks: tail path
	cmp	x2,#6
	rev	w12, w8
	sub	x2,x2,#3			// bias
	mov	v26.s[3],w12
	b.lo	.Loop3x_ctr32_unroll		// <6 blocks: 3-way path
	cmp	x2,#9
	orr	v27.16b,v24.16b,v24.16b
	add	w11, w8, #1
	orr	v28.16b,v24.16b,v24.16b
	add	w13, w8, #2
	rev	w11, w11
	orr	v29.16b,v24.16b,v24.16b
	add	w8, w8, #3
	rev	w13, w13
	mov	v27.s[3],w11
	rev	w14, w8
	mov	v28.s[3],w13
	mov	v29.s[3],w14
	sub	x2,x2,#3
	b.lo	.Loop6x_ctr32_unroll		// <9 blocks (after bias): 6-way path

	// push regs to stack when 12 data chunks are interleaved
	stp	x19,x20,[sp,#-16]!
	stp	x21,x22,[sp,#-16]!
	stp	x23,x24,[sp,#-16]!
	stp	d8,d9,[sp,#-32]!
	stp	d10,d11,[sp,#-32]!

	// Counter blocks 7..12 go in v30,v31,v8,v9,v10,v11.
	add	w15,w8,#1
	add	w19,w8,#2
	add	w20,w8,#3
	add	w21,w8,#4
	add	w22,w8,#5
	add	w8,w8,#6
	orr	v30.16b,v24.16b,v24.16b
	rev	w15,w15
	orr	v31.16b,v24.16b,v24.16b
	rev	w19,w19
	orr	v8.16b,v24.16b,v24.16b
	rev	w20,w20
	orr	v9.16b,v24.16b,v24.16b
	rev	w21,w21
	orr	v10.16b,v24.16b,v24.16b
	rev	w22,w22
	orr	v11.16b,v24.16b,v24.16b
	rev	w23,w8

	sub	x2,x2,#6			// bias
	mov	v30.s[3],w15
	mov	v31.s[3],w19
	mov	v8.s[3],w20
	mov	v9.s[3],w21
	mov	v10.s[3],w22
	mov	v11.s[3],w23
	b	.Loop12x_ctr32_unroll

.align	4
// Main loop: 12 counter blocks in flight; two AES rounds (v2,v3) per
// iteration, round keys streamed from x7; w6 counts remaining rounds.
.Loop12x_ctr32_unroll:
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v2.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v2.16b
	aesmc	v31.16b,v31.16b
	aese	v8.16b,v2.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	ld1	{v2.4s},[x7],#16
	subs	w6,w6,#2
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v3.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v3.16b
	aesmc	v31.16b,v31.16b
	aese	v8.16b,v3.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v3.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v3.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v3.16b
	aesmc	v11.16b,v11.16b
	ld1	{v3.4s},[x7],#16
	b.gt	.Loop12x_ctr32_unroll

	// Inner rounds done: run the remaining fixed rounds, interleaving
	// next-iteration counter computation and data loads for latency hiding.
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v2.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v2.16b
	aesmc	v31.16b,v31.16b
	aese	v8.16b,v2.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	ld1	{v2.4s},[x7],#16

	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v3.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v3.16b
	aesmc	v31.16b,v31.16b
	aese	v8.16b,v3.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v3.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v3.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v3.16b
	aesmc	v11.16b,v11.16b
	ld1	{v3.4s},[x7],#16

	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	add	w9,w8,#1			// next 12 counter values...
	add	w10,w8,#2
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	add	w12,w8,#3
	add	w11,w8,#4
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	add	w13,w8,#5
	add	w14,w8,#6
	rev	w9,w9
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	add	w15,w8,#7
	add	w19,w8,#8
	rev	w10,w10
	rev	w12,w12
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	add	w20,w8,#9
	add	w21,w8,#10
	rev	w11,w11
	rev	w13,w13
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	add	w22,w8,#11
	add	w23,w8,#12
	rev	w14,w14
	rev	w15,w15
	aese	v30.16b,v2.16b
	aesmc	v30.16b,v30.16b
	rev	w19,w19
	rev	w20,w20
	aese	v31.16b,v2.16b
	aesmc	v31.16b,v31.16b
	rev	w21,w21
	rev	w22,w22
	aese	v8.16b,v2.16b
	aesmc	v8.16b,v8.16b
	rev	w23,w23
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	ld1	{v2.4s},[x7],#16

	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64	// load 12 input blocks
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v3.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v3.16b
	aesmc	v31.16b,v31.16b
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	aese	v8.16b,v3.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v3.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v3.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v3.16b
	aesmc	v11.16b,v11.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	ld1	{v3.4s},[x7],#16

	mov	x7, x3				// rewind round-key pointer
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	aese	v30.16b,v2.16b
	aesmc	v30.16b,v30.16b
	aese	v31.16b,v2.16b
	aesmc	v31.16b,v31.16b
	aese	v8.16b,v2.16b
	aesmc	v8.16b,v8.16b
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	ld1	{v2.4s},[x7],#16		// re-pre-load rndkey[0]

	// Last round + 3-way XOR: ciphertext = input ^ last-round-key ^ state,
	// while recycling each state register as the next counter block (v0).
	aese	v24.16b,v3.16b
	.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
	orr	v24.16b,v0.16b,v0.16b
	aese	v25.16b,v3.16b
	.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
	orr	v25.16b,v0.16b,v0.16b
	aese	v26.16b,v3.16b
	.inst	0xce0168c6	//eor3 v6.16b,v6.16b,v1.16b,v26.16b
	orr	v26.16b,v0.16b,v0.16b
	aese	v27.16b,v3.16b
	.inst	0xce016ce7	//eor3 v7.16b,v7.16b,v1.16b,v27.16b
	orr	v27.16b,v0.16b,v0.16b
	aese	v28.16b,v3.16b
	.inst	0xce017210	//eor3 v16.16b,v16.16b,v1.16b,v28.16b
	orr	v28.16b,v0.16b,v0.16b
	aese	v29.16b,v3.16b
	.inst	0xce017631	//eor3 v17.16b,v17.16b,v1.16b,v29.16b
	orr	v29.16b,v0.16b,v0.16b
	aese	v30.16b,v3.16b
	.inst	0xce017a52	//eor3 v18.16b,v18.16b,v1.16b,v30.16b
	orr	v30.16b,v0.16b,v0.16b
	aese	v31.16b,v3.16b
	.inst	0xce017e73	//eor3 v19.16b,v19.16b,v1.16b,v31.16b
	orr	v31.16b,v0.16b,v0.16b
	aese	v8.16b,v3.16b
	.inst	0xce012294	//eor3 v20.16b,v20.16b,v1.16b,v8.16b
	orr	v8.16b,v0.16b,v0.16b
	aese	v9.16b,v3.16b
	.inst	0xce0126b5	//eor3 v21.16b,v21.16b,v1.16b,v9.16b
	orr	v9.16b,v0.16b,v0.16b
	aese	v10.16b,v3.16b
	.inst	0xce012ad6	//eor3 v22.16b,v22.16b,v1.16b,v10.16b
	orr	v10.16b,v0.16b,v0.16b
	aese	v11.16b,v3.16b
	.inst	0xce012ef7	//eor3 v23.16b,v23.16b,v1.16b,v11.16b
	orr	v11.16b,v0.16b,v0.16b
	ld1	{v3.4s},[x7],#16		// re-pre-load rndkey[1]

	// Insert the 12 fresh counter words and store the 12 output blocks.
	mov	v24.s[3],w9
	mov	v25.s[3],w10
	mov	v26.s[3],w12
	mov	v27.s[3],w11
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	mov	v28.s[3],w13
	mov	v29.s[3],w14
	mov	v30.s[3],w15
	mov	v31.s[3],w19
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
	mov	v8.s[3],w20
	mov	v9.s[3],w21
	mov	v10.s[3],w22
	mov	v11.s[3],w23
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	mov	w6,w5				// reset round counter

	add	w8,w8,#12
	subs	x2,x2,#12
	b.hs	.Loop12x_ctr32_unroll

	// pop regs from stack when 12 data chunks are interleaved
	ldp	d10,d11,[sp],#32
	ldp	d8,d9,[sp],#32
	ldp	x23,x24,[sp],#16
	ldp	x21,x22,[sp],#16
	ldp	x19,x20,[sp],#16

	add	x2,x2,#12			// undo bias; x2 = blocks left
	cbz	x2,.Lctr32_done_unroll
	sub	w8,w8,#12

	cmp	x2,#2
	b.ls	.Lctr32_tail_unroll

	cmp	x2,#6
	sub	x2,x2,#3			// bias
	add	w8,w8,#3
	b.lo	.Loop3x_ctr32_unroll

	sub	x2,x2,#3
	add	w8,w8,#3
	b.lo	.Loop6x_ctr32_unroll

.align	4
// 6-way interleaved path: counter blocks in v24-v29.
.Loop6x_ctr32_unroll:
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	ld1	{v2.4s},[x7],#16
	subs	w6,w6,#2
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	ld1	{v3.4s},[x7],#16
	b.gt	.Loop6x_ctr32_unroll

	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	ld1	{v2.4s},[x7],#16

	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	ld1	{v3.4s},[x7],#16

	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	add	w9,w8,#1			// next 6 counter values
	add	w10,w8,#2
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	add	w12,w8,#3
	add	w11,w8,#4
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	add	w13,w8,#5
	add	w14,w8,#6
	rev	w9,w9
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	rev	w10,w10
	rev	w12,w12
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	rev	w11,w11
	rev	w13,w13
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	rev	w14,w14
	ld1	{v2.4s},[x7],#16

	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64	// load 6 input blocks
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v3.16b
	aesmc	v27.16b,v27.16b
	ld1	{v16.16b,v17.16b},[x0],#32
	aese	v28.16b,v3.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v3.16b
	aesmc	v29.16b,v29.16b
	ld1	{v3.4s},[x7],#16

	mov	x7, x3				// rewind round-key pointer
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	aese	v27.16b,v2.16b
	aesmc	v27.16b,v27.16b
	aese	v28.16b,v2.16b
	aesmc	v28.16b,v28.16b
	aese	v29.16b,v2.16b
	aesmc	v29.16b,v29.16b
	ld1	{v2.4s},[x7],#16		// re-pre-load rndkey[0]

	aese	v24.16b,v3.16b
	.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
	aese	v25.16b,v3.16b
	.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
	aese	v26.16b,v3.16b
	.inst	0xce0168c6	//eor3 v6.16b,v6.16b,v1.16b,v26.16b
	aese	v27.16b,v3.16b
	.inst	0xce016ce7	//eor3 v7.16b,v7.16b,v1.16b,v27.16b
	aese	v28.16b,v3.16b
	.inst	0xce017210	//eor3 v16.16b,v16.16b,v1.16b,v28.16b
	aese	v29.16b,v3.16b
	.inst	0xce017631	//eor3 v17.16b,v17.16b,v1.16b,v29.16b
	ld1	{v3.4s},[x7],#16		// re-pre-load rndkey[1]

	orr	v24.16b,v0.16b,v0.16b		// refill counter blocks from v0
	orr	v25.16b,v0.16b,v0.16b
	orr	v26.16b,v0.16b,v0.16b
	orr	v27.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	orr	v29.16b,v0.16b,v0.16b

	mov	v24.s[3],w9
	mov	v25.s[3],w10
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	mov	v26.s[3],w12
	mov	v27.s[3],w11
	st1	{v16.16b,v17.16b},[x1],#32
	mov	v28.s[3],w13
	mov	v29.s[3],w14

	cbz	x2,.Lctr32_done_unroll
	mov	w6,w5

	cmp	x2,#2
	b.ls	.Lctr32_tail_unroll

	sub	x2,x2,#3			// bias
	add	w8,w8,#3
	b	.Loop3x_ctr32_unroll

.align	4
// 3-way interleaved path: counter blocks in v24-v26.
.Loop3x_ctr32_unroll:
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	ld1	{v2.4s},[x7],#16
	subs	w6,w6,#2
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v3.16b
	aesmc	v26.16b,v26.16b
	ld1	{v3.4s},[x7],#16
	b.gt	.Loop3x_ctr32_unroll

	// Move state into v9-v11 so v24-v26 can be refilled with counters early.
	aese	v24.16b,v2.16b
	aesmc	v9.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v10.16b,v25.16b
	ld1	{v4.16b,v5.16b,v6.16b},[x0],#48	// load 3 input blocks
	orr	v24.16b,v0.16b,v0.16b
	aese	v26.16b,v2.16b
	aesmc	v26.16b,v26.16b
	ld1	{v2.4s},[x7],#16
	orr	v25.16b,v0.16b,v0.16b
	aese	v9.16b,v3.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v3.16b
	aesmc	v10.16b,v10.16b
	aese	v26.16b,v3.16b
	aesmc	v11.16b,v26.16b
	ld1	{v3.4s},[x7],#16
	orr	v26.16b,v0.16b,v0.16b
	add	w9,w8,#1
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	add	w10,w8,#2
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	ld1	{v2.4s},[x7],#16
	add	w8,w8,#3
	aese	v9.16b,v3.16b
	aesmc	v9.16b,v9.16b
	aese	v10.16b,v3.16b
	aesmc	v10.16b,v10.16b

	rev	w9,w9
	aese	v11.16b,v3.16b
	aesmc	v11.16b,v11.16b
	ld1	{v3.4s},[x7],#16
	mov	v24.s[3], w9
	mov	x7,x3				// rewind round-key pointer
	rev	w10,w10
	aese	v9.16b,v2.16b
	aesmc	v9.16b,v9.16b

	aese	v10.16b,v2.16b
	aesmc	v10.16b,v10.16b
	mov	v25.s[3], w10
	rev	w12,w8
	aese	v11.16b,v2.16b
	aesmc	v11.16b,v11.16b
	mov	v26.s[3], w12

	aese	v9.16b,v3.16b
	aese	v10.16b,v3.16b
	aese	v11.16b,v3.16b

	.inst	0xce012484	//eor3 v4.16b,v4.16b,v1.16b,v9.16b
	ld1	{v2.4s},[x7],#16		// re-pre-load rndkey[0]
	.inst	0xce0128a5	//eor3 v5.16b,v5.16b,v1.16b,v10.16b
	mov	w6,w5
	.inst	0xce012cc6	//eor3 v6.16b,v6.16b,v1.16b,v11.16b
	ld1	{v3.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b,v5.16b,v6.16b},[x1],#48

	cbz	x2,.Lctr32_done_unroll

// Tail: one or two remaining blocks (counter blocks already in v24/v25).
.Lctr32_tail_unroll:
	cmp	x2,#1
	b.eq	.Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	ld1	{v2.4s},[x7],#16
	subs	w6,w6,#2
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	ld1	{v3.4s},[x7],#16
	b.gt	.Lctr32_tail_2_unroll

	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	ld1	{v2.4s},[x7],#16
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	ld1	{v3.4s},[x7],#16
	ld1	{v4.16b,v5.16b},[x0],#32
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	ld1	{v2.4s},[x7],#16
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v3.16b
	aesmc	v25.16b,v25.16b
	ld1	{v3.4s},[x7],#16
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v2.16b
	aesmc	v25.16b,v25.16b
	aese	v24.16b,v3.16b
	aese	v25.16b,v3.16b

	.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
	.inst	0xce0164a5	//eor3 v5.16b,v5.16b,v1.16b,v25.16b
	st1	{v4.16b,v5.16b},[x1],#32
	b	.Lctr32_done_unroll

.Lctr32_tail_1_unroll:
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	ld1	{v2.4s},[x7],#16
	subs	w6,w6,#2
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	ld1	{v3.4s},[x7],#16
	b.gt	.Lctr32_tail_1_unroll

	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	ld1	{v2.4s},[x7],#16
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	ld1	{v3.4s},[x7],#16
	ld1	{v4.16b},[x0]
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	ld1	{v2.4s},[x7],#16
	aese	v24.16b,v3.16b
	aesmc	v24.16b,v24.16b
	ld1	{v3.4s},[x7],#16
	aese	v24.16b,v2.16b
	aesmc	v24.16b,v24.16b
	aese	v24.16b,v3.16b

	.inst	0xce016084	//eor3 v4.16b,v4.16b,v1.16b,v24.16b
	st1	{v4.16b},[x1],#16

.Lctr32_done_unroll:
	// Restore callee-saved FP registers and the frame.
	ldp	d8,d9,[sp, #16]
	ldp	d10,d11,[sp, #32]
	ldp	d12,d13,[sp, #48]
	ldp	d14,d15,[sp, #64]
	ldr	x29,[sp],#80
	ret
.size	aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
// AES-CTR (32-bit counter) bulk encryption, generic 5x/3x interleaved
// variant. Same argument layout as the unroll12 variant above:
// x0 = input, x1 = output, x2 = block count, x3 = key schedule,
// x4 = counter block.
aes_v8_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]			// w5 = number of rounds

	ldr	w8, [x4, #12]			// w8 = 32-bit counter word
#ifdef __AARch64EB__
	ld1	{v0.16b},[x4]
#else
	ld1	{v0.4s},[x4]
#endif
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
// Continuation of aes_v8_ctr32_encrypt_blocks: w5 holds the round count
// (loaded from [x3,#240] just above), v0 the counter block, w8 its
// host-order 32-bit counter word, x0/x1/x2 = in/out/blocks.
	sub	w5,w5,#4
	mov	x12,#16				// input stride for the 2-block tail
	cmp	x2,#2
	add	x7,x3,x5,lsl#4			// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]			// v7 = last round key
	add	x7,x3,#32
	mov	w6,w5				// w6 = inner round-loop counter
	csel	x12,xzr,x12,lo			// single block: do not advance input
#ifndef __AARCH64EB__
	rev	w8, w8				// counter is big-endian in memory
#endif
	// Counter blocks 2 and 3 in v1 and v18; v6 keeps a pristine copy.
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v1.s[3],w10
	b.ls	.Lctr32_tail			// <=2 blocks: tail path
	rev	w12, w8
	sub	x2,x2,#3			// bias
	mov	v18.s[3],w12
	cmp	x2,#32
	b.lo	.Loop3x_ctr32

	// >=35 blocks: set up two more counter blocks for the 5x path.
	add	w13,w8,#1
	add	w14,w8,#2
	orr	v24.16b,v0.16b,v0.16b
	rev	w13,w13
	orr	v25.16b,v0.16b,v0.16b
	rev	w14,w14
	mov	v24.s[3],w13
	sub	x2,x2,#2			// bias
	mov	v25.s[3],w14
	add	w8,w8,#2
	b	.Loop5x_ctr32

.align	4
// 5-way interleaved path: counter blocks in v0,v1,v18,v24,v25;
// two AES rounds per iteration, round keys streamed from x7.
.Loop5x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_ctr32

	mov	x7,x3				// rewind round-key pointer
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	add	w9,w8,#1			// next 5 counter values
	add	w10,w8,#2
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	add	w12,w8,#3
	add	w13,w8,#4
	aese	v18.16b,v20.16b
	aesmc	v18.16b,v18.16b
	add	w14,w8,#5
	rev	w9,w9
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	rev	w10,w10
	rev	w12,w12
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	rev	w13,w13
	rev	w14,w14

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v21.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	ld1	{v2.16b},[x0],#16		// load 5 input blocks
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0],#16
	aese	v18.16b,v22.16b
	aesmc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	ld1	{v26.16b},[x0],#16
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	ld1	{v27.16b},[x0],#16

	// Last round: fold the last round key (v7) into the inputs, then
	// XOR with the keystream.
	aese	v0.16b,v23.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v1.16b,v23.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v18.16b,v23.16b
	eor	v19.16b,v19.16b,v7.16b
	aese	v24.16b,v23.16b
	eor	v26.16b,v26.16b,v7.16b
	aese	v25.16b,v23.16b
	eor	v27.16b,v27.16b,v7.16b

	eor	v2.16b,v2.16b,v0.16b
	orr	v0.16b,v6.16b,v6.16b		// refill counter blocks from v6
	eor	v3.16b,v3.16b,v1.16b
	orr	v1.16b,v6.16b,v6.16b
	eor	v19.16b,v19.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	eor	v26.16b,v26.16b,v24.16b
	orr	v24.16b,v6.16b,v6.16b
	eor	v27.16b,v27.16b,v25.16b
	orr	v25.16b,v6.16b,v6.16b

	st1	{v2.16b},[x1],#16		// store 5 output blocks
	mov	v0.s[3],w9
	st1	{v3.16b},[x1],#16
	mov	v1.s[3],w10
	st1	{v19.16b},[x1],#16
	mov	v18.s[3],w12
	st1	{v26.16b},[x1],#16
	mov	v24.s[3],w13
	st1	{v27.16b},[x1],#16
	mov	v25.s[3],w14

	mov	w6,w5				// reset round counter
	cbz	x2,.Lctr32_done

	add	w8,w8,#5
	subs	x2,x2,#5
	b.hs	.Loop5x_ctr32

	add	x2,x2,#5			// undo bias; x2 = blocks left
	sub	w8,w8,#5

	cmp	x2,#2
	mov	x12,#16
	csel	x12,xzr,x12,lo
	b.ls	.Lctr32_tail

	sub	x2,x2,#3			// bias
	add	w8,w8,#3
	b	.Loop3x_ctr32

.align	4
// 3-way interleaved path: counter blocks in v0,v1,v18.
.Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	// Move state into v4,v5,v17 so v0,v1,v18 can be refilled early.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16		// load 3 input blocks
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3				// rewind round-key pointer
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b		// fold last round key into inputs
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3			// undo bias; x2 = blocks left
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq			// single block: do not advance input

.Lctr32_tail:
	// One or two remaining blocks; keystream computed in v0 and v1.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12		// x12 = 0 when only one block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done
	st1	{v3.16b},[x1]

.Lctr32_done:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
.globl	aes_v8_xts_encrypt
.type	aes_v8_xts_encrypt,%function
.align	5
//----------------------------------------------------------------------
// void aes_v8_xts_encrypt(const u8 *in, u8 *out, size_t len,
//                         const AES_KEY *key1, const AES_KEY *key2,
//                         const u8 iv[16])
// AES-XTS encryption (ciphertext stealing for len%16 != 0).
// In:   x0 = in, x1 = out, x2 = len (bytes), x3 = key1 (data key
//       schedule), x4 = key2 (tweak key schedule), x5 = 16-byte iv.
// Uses: v6,v8..v11 hold up to five tweaks; w19 = 0x87 (GF(2^128)
//       feedback); x9/x10 = current tweak low/high; x21 = len%16.
// Callee-saved x19-x22 and d8-d11 are spilled only on the big-size
// path (frame pushed at .Lxts_enc_big_size, popped at .Lxts_abort).
// NOTE(review): assumes len >= 16 on entry (b.lo .Lxts_abort rejects
// less) — confirm against callers.
//----------------------------------------------------------------------
aes_v8_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Exactly one block: encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]			// rounds of key2
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2			// two rounds are unrolled per loop pass
	ld1	{v1.4s},[x4],#16

.Loop_enc_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_enc_iv_enc

	// Last two rounds: final aese has no MixColumns; xor last round key.
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b		// v6 = encrypted tweak

	ld1	{v0.16b},[x0]
	eor	v0.16b,v6.16b,v0.16b		// plaintext xor tweak

	// Encrypt the single block with key1.
	ldr	w6,[x3,#240]
	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...

	aese	v0.16b,v28.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v29.16b
	aesmc	v0.16b,v0.16b
	subs	w6,w6,#10			// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16		// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2			// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	// Final 10 rounds common to all key sizes.
	ld1	{v18.4s,v19.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32	// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]			// last round key
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	eor	v0.16b,v0.16b,v6.16b		// ciphertext xor tweak
	st1	{v0.16b},[x1]
	// Small path pushed no frame, so skip the register restores.
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
	stp	x19,x20,[sp,#-64]!
	stp	x21,x22,[sp,#48]
	stp	d8,d9,[sp,#32]
	stp	d10,d11,[sp,#16]

	// tailcnt (x21) stores the tail value of length%16.
	and	x21,x2,#0xf
	and	x2,x2,#-16
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lxts_abort			// less than one full block: nothing to do
	csel	x8,xzr,x8,eq			// exactly one block: don't advance x0 yet

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b		// v6 = tweak for block 1

	// The iv for the second block.
	// x9 - iv(low), x10 - iv(high)
	// the five ivs are stored in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b.
	// Tweak update = multiply by x in GF(2^128), poly feedback 0x87:
	// left-shift the 128-bit tweak by 1 and xor 0x87 into the low byte
	// if the top bit was set (sign-broadcast via asr #31).
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10

	ldr	w5,[x3,#240]		// next starting point
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]			// last round key

	add	x7,x3,#32
	mov	w6,w5

	// Encryption main path: build up to 5 blocks in flight.
.Lxts_enc:
	ld1	{v24.16b},[x0],#16
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b		// orr a,b,b == register copy
	orr	v1.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	orr	v27.16b,v24.16b,v24.16b
	orr	v29.16b,v24.16b,v24.16b
	b.lo	.Lxts_inner_enc_tail
	eor	v0.16b,v0.16b,v6.16b	// before encryption, xor with iv
	eor	v24.16b,v24.16b,v8.16b

	// The iv for the third block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10


	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b	// the third block
	eor	v24.16b,v24.16b,v9.16b
	cmp	x2,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for the fourth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10

	ld1	{v25.16b},[x0],#16
	// The iv for the fifth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v26.16b},[x0],#16
	eor	v25.16b,v25.16b,v10.16b		// the fourth block
	eor	v26.16b,v26.16b,v11.16b
	sub	x2,x2,#32			// bias
	mov	w6,w5
	b	.Loop5x_xts_enc

	// 5-way interleaved round loop: two rounds per iteration on
	// blocks v0,v1,v24,v25,v26.
.align	4
.Loop5x_xts_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_xts_enc

	// Remaining 7 rounds, fully unrolled; interleave next-tweak
	// computation and next-input loads with the AES rounds.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lxts_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	eor	v4.16b,v7.16b,v6.16b		// fold tweak into last round key
	aese	v0.16b,v23.16b
	// The iv for the first block of the next iteration.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v7.16b,v8.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	// The iv for the second block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10
	eor	v17.16b,v7.16b,v9.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	// The iv for the third block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10
	eor	v30.16b,v7.16b,v10.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	// The iv for the fourth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10
	eor	v31.16b,v7.16b,v11.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b

	// The iv for the fifth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lxts_enc_tail4x
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b		// complete last round + tweak xor
	eor	v0.16b,v2.16b,v6.16b		// next block 1 xor tweak
	eor	v5.16b,v5.16b,v1.16b
	eor	v1.16b,v3.16b,v8.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	eor	v26.16b,v29.16b,v11.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_xts_enc


	// If 4 blocks are left, borrow the five-block processing:
	// shift the tweaks down one slot and re-enter the 5x loop.
	cmn	x2,#0x10
	b.ne	.Loop5x_enc_after
	orr	v11.16b,v10.16b,v10.16b
	orr	v10.16b,v9.16b,v9.16b
	orr	v9.16b,v8.16b,v8.16b
	orr	v8.16b,v6.16b,v6.16b
	fmov	x9,d11
	fmov	x10,v11.d[1]
	eor	v0.16b,v6.16b,v2.16b
	eor	v1.16b,v8.16b,v3.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v26.16b,v29.16b,v11.16b
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	x2,x2,#0x50			// undo the tail4x bias
	cbz	x2,.Lxts_enc_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	b.lo	.Lxts_inner_enc_tail

	eor	v0.16b,v6.16b,v27.16b
	eor	v1.16b,v8.16b,v28.16b
	eor	v24.16b,v29.16b,v9.16b
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	x0,x0,#16
	eor	v5.16b,v1.16b,v5.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v24.16b,v17.16b
	st1	{v17.16b},[x1],#16
	eor	v30.16b,v25.16b,v30.16b
	eor	v31.16b,v26.16b,v31.16b
	st1	{v30.16b,v31.16b},[x1],#32

	b	.Lxts_enc_done
.align	4
.Lxts_outer_enc_tail:
	// 3-block round loop for a 3-block (or 2+tail) remainder.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_outer_enc_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	// The iv for the first block.
	fmov	x9,d9
	fmov	x10,v9.d[1]
	//mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v8.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v9.16b,v7.16b

	add	x6,x6,#0x20
	add	x0,x0,x6
	mov	x7,x3

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	add	w6,w5,#2
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	st1	{v5.16b},[x1],#16
	st1	{v24.16b},[x1],#16
	cmn	x2,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	orr	v28.16b,v3.16b,v3.16b
	orr	v29.16b,v27.16b,v27.16b
	nop

.Lxts_inner_enc_tail:
	// One or two blocks left (x2 == -0x10 means two).
	cmn	x2,#0x10
	eor	v1.16b,v28.16b,v6.16b
	eor	v24.16b,v29.16b,v8.16b
	b.eq	.Lxts_enc_tail_loop
	eor	v24.16b,v29.16b,v6.16b
.Lxts_enc_tail_loop:
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_enc_tail_loop

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v8.16b,v7.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	.Lxts_enc_one
	eor	v5.16b,v5.16b,v1.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v8.16b,v8.16b		// keep last used tweak in v6
	st1	{v17.16b},[x1],#16
	// Recompute the next tweak for the cipher-stealing tail.
	fmov	x9,d8
	fmov	x10,v8.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done

.Lxts_enc_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v6.16b,v6.16b
	st1	{v5.16b},[x1],#16
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	x21,#0xf
	b.eq	.Lxts_abort			// no tail: done

	// Swap the tail input bytes with the end of the last full
	// ciphertext block (steal x21 bytes), byte by byte.
	mov	x20,x0
	mov	x13,x1
	sub	x1,x1,#16
.composite_enc_loop:
	subs	x21,x21,#1
	ldrb	w15,[x1,x21]
	ldrb	w14,[x20,x21]
	strb	w15,[x13,x21]
	strb	w14,[x1,x21]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	ld1	{v26.16b},[x1]
	eor	v26.16b,v26.16b,v6.16b		// xor with final tweak

	// Encrypt the composite block to get the second-to-last encrypted text block.
	ldr	w6,[x3,#240]		// load key schedule...
	ld1	{v0.4s},[x3],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x3],#16		// load key schedule...
.Loop_final_enc:
	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3],#16
	subs	w6,w6,#2
	aese	v26.16b,v1.16b
	aesmc	v26.16b,v26.16b
	ld1	{v1.4s},[x3],#16
	b.gt	.Loop_final_enc

	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3]
	aese	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v6.16b
	st1	{v26.16b},[x1]

.Lxts_abort:
	// Restore callee-saved registers pushed at .Lxts_enc_big_size.
	ldp	x21,x22,[sp,#48]
	ldp	d8,d9,[sp,#32]
	ldp	d10,d11,[sp,#16]
	ldp	x19,x20,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
//----------------------------------------------------------------------
// void aes_v8_xts_decrypt(const u8 *in, u8 *out, size_t len,
//                         const AES_KEY *key1, const AES_KEY *key2,
//                         const u8 iv[16])
// AES-XTS decryption, mirror of aes_v8_xts_encrypt: the tweak is still
// produced by ENcrypting the iv with key2 (aese), while data blocks use
// the inverse cipher (aesd/aesimc) with key1. Same register conventions
// and stack frame as the encrypt path.
//----------------------------------------------------------------------
.globl	aes_v8_xts_decrypt
.type	aes_v8_xts_decrypt,%function
.align	5
aes_v8_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_dec_small_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b		// v6 = encrypted tweak

	ld1	{v0.16b},[x0]
	eor	v0.16b,v6.16b,v0.16b		// ciphertext xor tweak

	// Decrypt the single block with key1 (inverse cipher).
	ldr	w6,[x3,#240]
	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...

	aesd	v0.16b,v28.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
	aesd	v0.16b,v29.16b
	aesimc	v0.16b,v0.16b
	subs	w6,w6,#10			// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16		// load key schedule...
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2			// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	ld1	{v18.4s,v19.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32	// load key schedule...
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]			// last round key
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	eor	v0.16b,v6.16b,v0.16b		// plaintext xor tweak
	st1	{v0.16b},[x1]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
	stp	x19,x20,[sp,#-64]!
	stp	x21,x22,[sp,#48]
	stp	d8,d9,[sp,#32]
	stp	d10,d11,[sp,#16]

	// x21 = tailcnt = length%16; x2 rounded down to whole blocks.
	and	x21,x2,#0xf
	and	x2,x2,#-16
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_dec_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_dec_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b		// v6 = tweak for block 1

	// The iv for the second block.
	// x9 - iv(low), x10 - iv(high)
	// the five ivs are stored in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b.
	// Tweak update = multiply by x in GF(2^128) with 0x87 feedback
	// (same sequence as in aes_v8_xts_encrypt).
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d8,x9
	fmov	v8.d[1],x10

	ldr	w5,[x3,#240]		// load rounds number

	// The iv for the third block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d9,x9
	fmov	v9.d[1],x10

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32	// load key schedule...
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]			// last round key

	// The iv for the fourth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d10,x9
	fmov	v10.d[1],x10

	add	x7,x3,#32
	mov	w6,w5
	b	.Lxts_dec

	// Decryption main path.
.align	5
.Lxts_dec:
	// With a tail present, the last full block must be held back
	// for cipher stealing, so consume one block of the count first.
	tst	x21,#0xf
	b.eq	.Lxts_dec_begin
	subs	x2,x2,#16
	csel	x8,xzr,x8,eq
	ld1	{v0.16b},[x0],#16
	b.lo	.Lxts_done
	sub	x0,x0,#16
.Lxts_dec_begin:
	ld1	{v0.16b},[x0],x8
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b		// register copies of the inputs
	orr	v1.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	ld1	{v24.16b},[x0],#16
	orr	v27.16b,v24.16b,v24.16b
	orr	v29.16b,v24.16b,v24.16b
	b.lo	.Lxts_inner_dec_tail
	eor	v0.16b,v0.16b,v6.16b	// before decrypt, xor with iv
	eor	v24.16b,v24.16b,v8.16b

	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b	// third block xor with third iv
	eor	v24.16b,v24.16b,v9.16b
	cmp	x2,#32
	b.lo	.Lxts_outer_dec_tail

	ld1	{v25.16b},[x0],#16

	// The iv for the fifth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v26.16b},[x0],#16
	eor	v25.16b,v25.16b,v10.16b		// the fourth block
	eor	v26.16b,v26.16b,v11.16b
	sub	x2,x2,#32			// bias
	mov	w6,w5
	b	.Loop5x_xts_dec

	// 5-way interleaved inverse-round loop.
.align	4
.Loop5x_xts_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16		// load key schedule...
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	// Remaining 7 rounds, fully unrolled; interleave next-tweak
	// computation and next-input loads with the AES rounds.
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	subs	x2,x2,#0x50			// because .Lxts_dec_tail4x

	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v18.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v18.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v18.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v18.16b
	aesimc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lxts_dec_tail4x

	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v19.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v19.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v19.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v19.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v20.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v20.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v21.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v21.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v22.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v22.16b
	aesimc	v26.16b,v26.16b

	eor	v4.16b,v7.16b,v6.16b		// fold tweak into last round key
	aesd	v0.16b,v23.16b
	// The iv for the first block of the next iteration.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v7.16b,v8.16b
	ld1	{v2.16b},[x0],#16
	aesd	v1.16b,v23.16b
	// The iv for the second block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d8,x9
	fmov	v8.d[1],x10
	eor	v17.16b,v7.16b,v9.16b
	ld1	{v3.16b},[x0],#16
	aesd	v24.16b,v23.16b
	// The iv for the third block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d9,x9
	fmov	v9.d[1],x10
	eor	v30.16b,v7.16b,v10.16b
	ld1	{v27.16b},[x0],#16
	aesd	v25.16b,v23.16b
	// The iv for the fourth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d10,x9
	fmov	v10.d[1],x10
	eor	v31.16b,v7.16b,v11.16b
	ld1	{v28.16b},[x0],#16
	aesd	v26.16b,v23.16b

	// The iv for the fifth block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lxts_dec_tail4x
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b		// complete last round + tweak xor
	eor	v0.16b,v2.16b,v6.16b		// next block 1 xor tweak
	eor	v5.16b,v5.16b,v1.16b
	eor	v1.16b,v3.16b,v8.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	eor	v26.16b,v29.16b,v11.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_xts_dec

	cmn	x2,#0x10
	b.ne	.Loop5x_dec_after
	// If x2 equals -0x10, 4 blocks are left.
	// After special processing, utilize the five-block processing again.
	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
	orr	v11.16b,v10.16b,v10.16b
	orr	v10.16b,v9.16b,v9.16b
	orr	v9.16b,v8.16b,v8.16b
	orr	v8.16b,v6.16b,v6.16b
	fmov	x9,d11
	fmov	x10,v11.d[1]
	eor	v0.16b,v6.16b,v2.16b
	eor	v1.16b,v8.16b,v3.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v26.16b,v29.16b,v11.16b
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	x2,x2,#0x50			// undo the tail4x bias
	cbz	x2,.Lxts_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	b.lo	.Lxts_inner_dec_tail

	eor	v0.16b,v6.16b,v27.16b
	eor	v1.16b,v8.16b,v28.16b
	eor	v24.16b,v29.16b,v9.16b
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	x0,x0,#16
	tst	x21,#0xf
	eor	v5.16b,v1.16b,v4.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v24.16b,v17.16b
	st1	{v17.16b},[x1],#16
	eor	v30.16b,v25.16b,v30.16b
	eor	v31.16b,v26.16b,v31.16b
	st1	{v30.16b,v31.16b},[x1],#32

	b.eq	.Lxts_dec_abort			// no tail: done
	ld1	{v0.16b},[x0],#16		// reload the held-back block
	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	// 3-block inverse-round loop for a 3-block (or 2+tail) remainder.
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	// The iv for the first block.
	fmov	x9,d9
	fmov	x10,v9.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v8.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	eor	v17.16b,v9.16b,v7.16b
	// The iv for the second block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d8,x9
	fmov	v8.d[1],x10

	add	x6,x6,#0x20
	add	x0,x0,x6		// x0 is adjusted to the last data

	mov	x7,x3

	// The iv for the third block.
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d9,x9
	fmov	v9.d[1],x10

	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	st1	{v5.16b},[x1],#16
	st1	{v24.16b},[x1],#16

	cmn	x2,#0x30
	add	x2,x2,#0x30
	b.eq	.Lxts_done
	sub	x2,x2,#0x30
	orr	v28.16b,v3.16b,v3.16b
	orr	v29.16b,v27.16b,v27.16b
	nop

.Lxts_inner_dec_tail:
	// x2 == -0x10 means two blocks left.
	cmn	x2,#0x10
	eor	v1.16b,v28.16b,v6.16b
	eor	v24.16b,v29.16b,v8.16b
	b.eq	.Lxts_dec_tail_loop
	eor	v24.16b,v29.16b,v6.16b
.Lxts_dec_tail_loop:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	cmn	x2,#0x20
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	eor	v17.16b,v8.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	b.eq	.Lxts_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v9.16b,v9.16b		// slide tweaks down for the tail
	orr	v8.16b,v10.16b,v10.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	add	x2,x2,#16
	b	.Lxts_done

.Lxts_dec_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v8.16b,v8.16b
	orr	v8.16b,v9.16b,v9.16b
	st1	{v5.16b},[x1],#16
	add	x2,x2,#32

.Lxts_done:
	tst	x21,#0xf
	b.eq	.Lxts_dec_abort			// no tail: done
	// Process the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	ld1	{v0.16b},[x0],#16

	// Decrypt the second-to-last block to get the last plain text block.
	// Note: it is decrypted with the NEXT tweak (v8), per XTS-CTS.
.Lxts_dec_1st_done:
	eor	v26.16b,v0.16b,v8.16b
	ldr	w6,[x3,#240]
	ld1	{v0.4s},[x3],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x3],#16
.Loop_final_2nd_dec:
	aesd	v26.16b,v0.16b
	aesimc	v26.16b,v26.16b
	ld1	{v0.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2
	aesd	v26.16b,v1.16b
	aesimc	v26.16b,v26.16b
	ld1	{v1.4s},[x3],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	v26.16b,v0.16b
	aesimc	v26.16b,v26.16b
	ld1	{v0.4s},[x3]
	aesd	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v8.16b
	st1	{v26.16b},[x1]

	mov	x20,x0
	add	x13,x1,#16

	// Composite the tailcnt "16 byte not aligned block" into the
	// second-to-last plain block to get the last encrypted block
	// (byte-wise swap, as in the encrypt path).
.composite_dec_loop:
	subs	x21,x21,#1
	ldrb	w15,[x1,x21]
	ldrb	w14,[x20,x21]
	strb	w15,[x13,x21]
	strb	w14,[x1,x21]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	ld1	{v26.16b},[x1]
	eor	v26.16b,v26.16b,v6.16b		// xor with tweak v6

	// Decrypt the composite block to get the second-to-last plain text block.
	ldr	w6,[x7,#240]
	ld1	{v0.4s},[x7],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x7],#16
.Loop_final_dec:
	aesd	v26.16b,v0.16b
	aesimc	v26.16b,v26.16b
	ld1	{v0.4s},[x7],#16		// load key schedule...
	subs	w6,w6,#2
	aesd	v26.16b,v1.16b
	aesimc	v26.16b,v26.16b
	ld1	{v1.4s},[x7],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	v26.16b,v0.16b
	aesimc	v26.16b,v26.16b
	ld1	{v0.4s},[x7]
	aesd	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v6.16b
	st1	{v26.16b},[x1]

.Lxts_dec_abort:
	// Restore callee-saved registers pushed at .Lxts_dec_big_size.
	ldp	x21,x22,[sp,#48]
	ldp	d8,d9,[sp,#32]
	ldp	d10,d11,[sp,#16]
	ldp	x19,x20,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
#endif