#include "arm_asm.h"
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) a,b,c,0xf2
#endif

.text
@-----------------------------------------------------------------------
@ aes_gcm_enc_128_kernel
@
@ Counter-mode AES encryption with a 10-round key schedule, interleaved
@ with GHASH accumulation over the ciphertext it produces.  Rounds 0..9
@ run as aese/aesmc on counter blocks held in q0..q3; rk10 is XORed into
@ the plaintext in general registers and the result is XORed with the
@ round-9 state.  The main loop handles four 16-byte blocks per pass.
@
@ Inputs, as read by the code below:
@   r0  input pointer (plaintext is loaded from here; r0 advances)
@   r1  input length in bits; zero returns immediately with r0 = 0
@   r2  output pointer (results are stored here; r2 advances)
@   r3  buffer with the running tag at offset 0 and hash-key values
@       h1/h2/h3/h4 at offsets 32/64/80/112; the tag is read on entry
@       and written back on exit
@   r4  pointer to the 16-byte counter block; the word at offset 12 is
@       rewritten with the updated counter before returning
@   r5  pointer to the round keys (rk0..rk9 read at 16-byte stride,
@       rk10 read from offset 160)
@ Returns:
@   r0  r1 >> 3 (the byte length computed at entry)
@ Stack:
@   112-byte frame holding r19-r24 and d8-d15, restored before RET.
@
@ NOTE(review): the mnemonics used here (stp/ldp, cbz, csel, fmov,
@ xzr, 64-bit immediates) are AArch64 forms, while the operands use
@ rN/qN names and the file preamble selects .fpu neon and .code 32.
@ This looks like generator output produced with a mismatched target
@ flavour - confirm whether this file is ever assembled as-is.
@-----------------------------------------------------------------------
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
	AARCH64_VALID_CALL_TARGET
	@ Zero bit length: skip everything and return 0.
	cbz r1, .L128_enc_ret
	@ Open the 112-byte frame.  The r4/r5 arguments are copied to
	@ r16/r8 because r4 and r5 are reused below as end_input_ptr and
	@ byte_len.
	stp r19, r20, [sp, #-112]!
	mov r16, r4
	mov r8, r5
	stp r21, r22, [sp, #16]
	stp r23, r24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	@ Load the counter block, rk10 and the running tag; r15 keeps
	@ byte_len for the return value.
	ldp r10, r11, [r16]	@ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
	rev r10, r10
	rev r11, r11
#endif
	ldp r13, r14, [r8, #160]	@ load rk10
#ifdef __ARMEB__
	ror r13, r13, #32
	ror r14, r14, #32
#endif
	ld1 {v11.16b}, [r3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	lsr r5, r1, #3	@ byte_len
	mov r15, r5

	@ From here to the first bge: build counter blocks 1..3 in q1..q3
	@ (block 0 is loaded straight from memory into q0), stream rk0..rk9
	@ into v18..v27 and the hash-key values into v12..v15, and run AES
	@ rounds 0..9 on all four blocks.
	ld1 {v18.4s}, [r8], #16	@ load rk0
	add r4, r0, r1, lsr #3	@ end_input_ptr
	sub r5, r5, #1	@ byte_len - 1

	lsr r12, r11, #32
	ldr q15, [r3, #112]	@ load h4l | h4h
#ifndef __ARMEB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	fmov d1, r10	@ CTR block 1
	rev r12, r12	@ rev_ctr32

	add r12, r12, #1	@ increment rev_ctr32
	orr r11, r11, r11
	ld1 {v19.4s}, [r8], #16	@ load rk1

	rev r9, r12	@ CTR block 1
	add r12, r12, #1	@ CTR block 1
	fmov d3, r10	@ CTR block 3

	orr r9, r11, r9, lsl #32	@ CTR block 1
	ld1 { q0}, [r16]	@ special case vector load initial counter so we can start first AES block as quickly as possible

	fmov v1.d[1], r9	@ CTR block 1
	rev r9, r12	@ CTR block 2

	fmov d2, r10	@ CTR block 2
	orr r9, r11, r9, lsl #32	@ CTR block 2
	add r12, r12, #1	@ CTR block 2

	fmov v2.d[1], r9	@ CTR block 2
	rev r9, r12	@ CTR block 3

	orr r9, r11, r9, lsl #32	@ CTR block 3
	ld1 {v20.4s}, [r8], #16	@ load rk2

	add r12, r12, #1	@ CTR block 3
	fmov v3.d[1], r9	@ CTR block 3

	ldr q14, [r3, #80]	@ load h3l | h3h
#ifndef __ARMEB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese q1, v18.16b
	aesmc q1, q1	@ AES block 1 - round 0
	ld1 {v21.4s}, [r8], #16	@ load rk3

	aese q2, v18.16b
	aesmc q2, q2	@ AES block 2 - round 0
	ldr q12, [r3, #32]	@ load h1l | h1h
#ifndef __ARMEB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif

	aese q0, v18.16b
	aesmc q0, q0	@ AES block 0 - round 0
	ld1 {v22.4s}, [r8], #16	@ load rk4

	aese q3, v18.16b
	aesmc q3, q3	@ AES block 3 - round 0
	ld1 {v23.4s}, [r8], #16	@ load rk5

	aese q2, v19.16b
	aesmc q2, q2	@ AES block 2 - round 1
	trn2 v17.2d, v14.2d, v15.2d	@ h4l | h3l

	aese q0, v19.16b
	aesmc q0, q0	@ AES block 0 - round 1
	ld1 {v24.4s}, [r8], #16	@ load rk6

	aese q1, v19.16b
	aesmc q1, q1	@ AES block 1 - round 1
	ld1 {v25.4s}, [r8], #16	@ load rk7

	aese q3, v19.16b
	aesmc q3, q3	@ AES block 3 - round 1
	trn1 q9, v14.2d, v15.2d	@ h4h | h3h

	aese q0, v20.16b
	aesmc q0, q0	@ AES block 0 - round 2
	ld1 {v26.4s}, [r8], #16	@ load rk8

	aese q1, v20.16b
	aesmc q1, q1	@ AES block 1 - round 2
	ldr q13, [r3, #64]	@ load h2l | h2h
#ifndef __ARMEB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif

	aese q3, v20.16b
	aesmc q3, q3	@ AES block 3 - round 2

	aese q2, v20.16b
	aesmc q2, q2	@ AES block 2 - round 2
	eor v17.16b, v17.16b, q9	@ h4k | h3k

	aese q0, v21.16b
	aesmc q0, q0	@ AES block 0 - round 3

	aese q1, v21.16b
	aesmc q1, q1	@ AES block 1 - round 3

	aese q2, v21.16b
	aesmc q2, q2	@ AES block 2 - round 3
	ld1 {v27.4s}, [r8], #16	@ load rk9

	aese q3, v21.16b
	aesmc q3, q3	@ AES block 3 - round 3

	@ r5 becomes r0 + ((byte_len - 1) rounded down to a multiple of 64):
	@ the input address at which the 64-byte path must stop.
	and r5, r5, #0xffffffffffffffc0	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	trn2 v16.2d, v12.2d, v13.2d	@ h2l | h1l

	aese q3, v22.16b
	aesmc q3, q3	@ AES block 3 - round 4
	add r5, r5, r0

	aese q2, v22.16b
	aesmc q2, q2	@ AES block 2 - round 4
	@ Flags set here are consumed by the bge after round 9; nothing in
	@ between is a flag-setting instruction.
	cmp r0, r5	@ check if we have <= 4 blocks

	aese q0, v22.16b
	aesmc q0, q0	@ AES block 0 - round 4

	aese q3, v23.16b
	aesmc q3, q3	@ AES block 3 - round 5

	aese q2, v23.16b
	aesmc q2, q2	@ AES block 2 - round 5

	aese q0, v23.16b
	aesmc q0, q0	@ AES block 0 - round 5

	aese q3, v24.16b
	aesmc q3, q3	@ AES block 3 - round 6

	aese q1, v22.16b
	aesmc q1, q1	@ AES block 1 - round 4

	aese q2, v24.16b
	aesmc q2, q2	@ AES block 2 - round 6
	trn1 q8, v12.2d, v13.2d	@ h2h | h1h

	aese q0, v24.16b
	aesmc q0, q0	@ AES block 0 - round 6

	aese q1, v23.16b
	aesmc q1, q1	@ AES block 1 - round 5

	aese q3, v25.16b
	aesmc q3, q3	@ AES block 3 - round 7

	aese q0, v25.16b
	aesmc q0, q0	@ AES block 0 - round 7

	aese q1, v24.16b
	aesmc q1, q1	@ AES block 1 - round 6

	aese q2, v25.16b
	aesmc q2, q2	@ AES block 2 - round 7

	aese q0, v26.16b
	aesmc q0, q0	@ AES block 0 - round 8

	aese q1, v25.16b
	aesmc q1, q1	@ AES block 1 - round 7

	aese q2, v26.16b
	aesmc q2, q2	@ AES block 2 - round 8

	aese q3, v26.16b
	aesmc q3, q3	@ AES block 3 - round 8

	aese q1, v26.16b
	aesmc q1, q1	@ AES block 1 - round 8

	aese q2, v27.16b	@ AES block 2 - round 9

	aese q0, v27.16b	@ AES block 0 - round 9

	eor v16.16b, v16.16b, q8	@ h2k | h1k

	aese q1, v27.16b	@ AES block 1 - round 9

	aese q3, v27.16b	@ AES block 3 - round 9
	bge .L128_enc_tail	@ handle tail

	@ First four blocks: there is no ciphertext to hash yet, so this
	@ only finishes AES (plaintext XOR rk10 in r6/r7, r19-r24, then XOR
	@ with the round-9 state) and stores 64 bytes.  Counter blocks 4..6
	@ are rebuilt in q0..q2 and the high half of block 7 is left in r9.
	ldp r6, r7, [r0, #0]	@ AES block 0 - load plaintext
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	ldp r21, r22, [r0, #32]	@ AES block 2 - load plaintext
#ifdef __ARMEB__
	rev r21, r21
	rev r22, r22
#endif
	ldp r19, r20, [r0, #16]	@ AES block 1 - load plaintext
#ifdef __ARMEB__
	rev r19, r19
	rev r20, r20
#endif
	ldp r23, r24, [r0, #48]	@ AES block 3 - load plaintext
#ifdef __ARMEB__
	rev r23, r23
	rev r24, r24
#endif
	eor r6, r6, r13	@ AES block 0 - round 10 low
	eor r7, r7, r14	@ AES block 0 - round 10 high

	eor r21, r21, r13	@ AES block 2 - round 10 low
	fmov d4, r6	@ AES block 0 - mov low

	eor r19, r19, r13	@ AES block 1 - round 10 low
	eor r22, r22, r14	@ AES block 2 - round 10 high
	fmov v4.d[1], r7	@ AES block 0 - mov high

	fmov d5, r19	@ AES block 1 - mov low
	eor r20, r20, r14	@ AES block 1 - round 10 high

	eor r23, r23, r13	@ AES block 3 - round 10 low
	fmov v5.d[1], r20	@ AES block 1 - mov high

	fmov d6, r21	@ AES block 2 - mov low
	eor r24, r24, r14	@ AES block 3 - round 10 high
	rev r9, r12	@ CTR block 4

	fmov v6.d[1], r22	@ AES block 2 - mov high
	orr r9, r11, r9, lsl #32	@ CTR block 4

	eor q4, q4, q0	@ AES block 0 - result
	fmov d0, r10	@ CTR block 4
	add r12, r12, #1	@ CTR block 4

	fmov v0.d[1], r9	@ CTR block 4
	rev r9, r12	@ CTR block 5

	eor q5, q5, q1	@ AES block 1 - result
	fmov d1, r10	@ CTR block 5
	orr r9, r11, r9, lsl #32	@ CTR block 5

	add r12, r12, #1	@ CTR block 5
	add r0, r0, #64	@ AES input_ptr update
	fmov v1.d[1], r9	@ CTR block 5

	fmov d7, r23	@ AES block 3 - mov low
	rev r9, r12	@ CTR block 6
	st1 { q4}, [r2], #16	@ AES block 0 - store result

	fmov v7.d[1], r24	@ AES block 3 - mov high
	orr r9, r11, r9, lsl #32	@ CTR block 6

	add r12, r12, #1	@ CTR block 6
	eor q6, q6, q2	@ AES block 2 - result
	st1 { q5}, [r2], #16	@ AES block 1 - store result

	fmov d2, r10	@ CTR block 6
	cmp r0, r5	@ check if we have <= 8 blocks

	fmov v2.d[1], r9	@ CTR block 6
	rev r9, r12	@ CTR block 7
	st1 { q6}, [r2], #16	@ AES block 2 - store result

	orr r9, r11, r9, lsl #32	@ CTR block 7

	eor q7, q7, q3	@ AES block 3 - result
	st1 { q7}, [r2], #16	@ AES block 3 - store result
	bge .L128_enc_prepretail	@ do prepretail

	@ Main loop, one pass per 64 bytes: GHASH the four ciphertext blocks
	@ left in q4..q7 by the previous pass (products collected in v9 high,
	@ v10 mid, v11 low, then reduced with the 0xc2 constant in v8) while
	@ running rounds 0..9 on the next four counter blocks, loading the
	@ next 64 bytes of plaintext and storing the next 64 bytes of output.
	@ Loops while r0 < r5.
.L128_enc_main_loop:@ main loop start
	ldp r23, r24, [r0, #48]	@ AES block 4k+3 - load plaintext
#ifdef __ARMEB__
	rev r23, r23
	rev r24, r24
#endif
	rev64 q4, q4	@ GHASH block 4k (only t0 is free)
	rev64 q6, q6	@ GHASH block 4k+2 (t0, t1, and t2 free)

	aese q2, v18.16b
	aesmc q2, q2	@ AES block 4k+6 - round 0
	fmov d3, r10	@ CTR block 4k+3

	ext v11.16b, v11.16b, v11.16b, #8	@ PRE 0
	rev64 q5, q5	@ GHASH block 4k+1 (t0 and t1 free)

	aese q1, v18.16b
	aesmc q1, q1	@ AES block 4k+5 - round 0
	add r12, r12, #1	@ CTR block 4k+3
	fmov v3.d[1], r9	@ CTR block 4k+3

	aese q0, v18.16b
	aesmc q0, q0	@ AES block 4k+4 - round 0
	mov d31, v6.d[1]	@ GHASH block 4k+2 - mid

	aese q2, v19.16b
	aesmc q2, q2	@ AES block 4k+6 - round 1
	mov d30, v5.d[1]	@ GHASH block 4k+1 - mid

	aese q1, v19.16b
	aesmc q1, q1	@ AES block 4k+5 - round 1
	eor q4, q4, v11.16b	@ PRE 1

	aese q3, v18.16b
	aesmc q3, q3	@ AES block 4k+7 - round 0
	eor r24, r24, r14	@ AES block 4k+3 - round 10 high

	pmull2 v28.1q, q5, v14.2d	@ GHASH block 4k+1 - high
	eor v31.8b, v31.8b, q6	@ GHASH block 4k+2 - mid
	ldp r6, r7, [r0, #0]	@ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	aese q0, v19.16b
	aesmc q0, q0	@ AES block 4k+4 - round 1
	rev r9, r12	@ CTR block 4k+8

	eor v30.8b, v30.8b, q5	@ GHASH block 4k+1 - mid
	mov d8, v4.d[1]	@ GHASH block 4k - mid
	orr r9, r11, r9, lsl #32	@ CTR block 4k+8

	pmull2 v9.1q, q4, v15.2d	@ GHASH block 4k - high
	add r12, r12, #1	@ CTR block 4k+8
	mov d10, v17.d[1]	@ GHASH block 4k - mid

	aese q0, v20.16b
	aesmc q0, q0	@ AES block 4k+4 - round 2

	pmull v11.1q, q4, v15.1d	@ GHASH block 4k - low
	eor q8, q8, q4	@ GHASH block 4k - mid

	aese q1, v20.16b
	aesmc q1, q1	@ AES block 4k+5 - round 2

	aese q0, v21.16b
	aesmc q0, q0	@ AES block 4k+4 - round 3
	eor q9, q9, v28.16b	@ GHASH block 4k+1 - high

	pmull v28.1q, q6, v13.1d	@ GHASH block 4k+2 - low

	pmull v10.1q, q8, v10.1d	@ GHASH block 4k - mid
	rev64 q7, q7	@ GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull v30.1q, v30.1d, v17.1d	@ GHASH block 4k+1 - mid

	pmull v29.1q, q5, v14.1d	@ GHASH block 4k+1 - low
	ins v31.d[1], v31.d[0]	@ GHASH block 4k+2 - mid

	pmull2 v8.1q, q6, v13.2d	@ GHASH block 4k+2 - high
	eor r7, r7, r14	@ AES block 4k+4 - round 10 high

	eor v10.16b, v10.16b, v30.16b	@ GHASH block 4k+1 - mid
	mov d30, v7.d[1]	@ GHASH block 4k+3 - mid

	aese q3, v19.16b
	aesmc q3, q3	@ AES block 4k+7 - round 1
	eor v11.16b, v11.16b, v29.16b	@ GHASH block 4k+1 - low

	aese q2, v20.16b
	aesmc q2, q2	@ AES block 4k+6 - round 2
	eor r6, r6, r13	@ AES block 4k+4 - round 10 low

	aese q1, v21.16b
	aesmc q1, q1	@ AES block 4k+5 - round 3
	eor v30.8b, v30.8b, q7	@ GHASH block 4k+3 - mid

	pmull2 v4.1q, q7, v12.2d	@ GHASH block 4k+3 - high

	aese q2, v21.16b
	aesmc q2, q2	@ AES block 4k+6 - round 3
	eor q9, q9, q8	@ GHASH block 4k+2 - high

	pmull2 v31.1q, v31.2d, v16.2d	@ GHASH block 4k+2 - mid

	pmull v29.1q, q7, v12.1d	@ GHASH block 4k+3 - low
	movi q8, #0xc2

	pmull v30.1q, v30.1d, v16.1d	@ GHASH block 4k+3 - mid
	eor v11.16b, v11.16b, v28.16b	@ GHASH block 4k+2 - low

	aese q1, v22.16b
	aesmc q1, q1	@ AES block 4k+5 - round 4

	aese q3, v20.16b
	aesmc q3, q3	@ AES block 4k+7 - round 2
	shl d8, d8, #56	@ mod_constant

	aese q0, v22.16b
	aesmc q0, q0	@ AES block 4k+4 - round 4
	eor q9, q9, q4	@ GHASH block 4k+3 - high

	aese q1, v23.16b
	aesmc q1, q1	@ AES block 4k+5 - round 5
	ldp r19, r20, [r0, #16]	@ AES block 4k+5 - load plaintext
#ifdef __ARMEB__
	rev r19, r19
	rev r20, r20
#endif
	aese q3, v21.16b
	aesmc q3, q3	@ AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v31.16b	@ GHASH block 4k+2 - mid

	aese q0, v23.16b
	aesmc q0, q0	@ AES block 4k+4 - round 5
	ldp r21, r22, [r0, #32]	@ AES block 4k+6 - load plaintext
#ifdef __ARMEB__
	rev r21, r21
	rev r22, r22
#endif
	pmull v31.1q, q9, q8	@ MODULO - top 64b align with mid
	eor v11.16b, v11.16b, v29.16b	@ GHASH block 4k+3 - low

	aese q2, v22.16b
	aesmc q2, q2	@ AES block 4k+6 - round 4
	eor r19, r19, r13	@ AES block 4k+5 - round 10 low

	aese q3, v22.16b
	aesmc q3, q3	@ AES block 4k+7 - round 4
	eor v10.16b, v10.16b, v30.16b	@ GHASH block 4k+3 - mid

	aese q1, v24.16b
	aesmc q1, q1	@ AES block 4k+5 - round 6
	eor r23, r23, r13	@ AES block 4k+3 - round 10 low

	aese q2, v23.16b
	aesmc q2, q2	@ AES block 4k+6 - round 5
	eor v30.16b, v11.16b, q9	@ MODULO - karatsuba tidy up

	fmov d4, r6	@ AES block 4k+4 - mov low
	aese q0, v24.16b
	aesmc q0, q0	@ AES block 4k+4 - round 6
	fmov v4.d[1], r7	@ AES block 4k+4 - mov high

	add r0, r0, #64	@ AES input_ptr update
	fmov d7, r23	@ AES block 4k+3 - mov low
	ext q9, q9, q9, #8	@ MODULO - other top alignment

	aese q3, v23.16b
	aesmc q3, q3	@ AES block 4k+7 - round 5
	fmov d5, r19	@ AES block 4k+5 - mov low

	aese q0, v25.16b
	aesmc q0, q0	@ AES block 4k+4 - round 7
	eor v10.16b, v10.16b, v30.16b	@ MODULO - karatsuba tidy up

	aese q2, v24.16b
	aesmc q2, q2	@ AES block 4k+6 - round 6
	eor r20, r20, r14	@ AES block 4k+5 - round 10 high

	aese q1, v25.16b
	aesmc q1, q1	@ AES block 4k+5 - round 7
	fmov v5.d[1], r20	@ AES block 4k+5 - mov high

	aese q0, v26.16b
	aesmc q0, q0	@ AES block 4k+4 - round 8
	fmov v7.d[1], r24	@ AES block 4k+3 - mov high

	aese q3, v24.16b
	aesmc q3, q3	@ AES block 4k+7 - round 6
	@ Flags from this compare are consumed by the blt at the bottom of
	@ the loop; nothing in between is a flag-setting instruction.
	cmp r0, r5	@ .LOOP CONTROL

	aese q1, v26.16b
	aesmc q1, q1	@ AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v31.16b	@ MODULO - fold into mid

	aese q0, v27.16b	@ AES block 4k+4 - round 9
	eor r21, r21, r13	@ AES block 4k+6 - round 10 low
	eor r22, r22, r14	@ AES block 4k+6 - round 10 high

	aese q3, v25.16b
	aesmc q3, q3	@ AES block 4k+7 - round 7
	fmov d6, r21	@ AES block 4k+6 - mov low

	aese q1, v27.16b	@ AES block 4k+5 - round 9
	fmov v6.d[1], r22	@ AES block 4k+6 - mov high

	aese q2, v25.16b
	aesmc q2, q2	@ AES block 4k+6 - round 7
	eor q4, q4, q0	@ AES block 4k+4 - result

	fmov d0, r10	@ CTR block 4k+8
	aese q3, v26.16b
	aesmc q3, q3	@ AES block 4k+7 - round 8

	fmov v0.d[1], r9	@ CTR block 4k+8
	rev r9, r12	@ CTR block 4k+9
	eor v10.16b, v10.16b, q9	@ MODULO - fold into mid

	aese q2, v26.16b
	aesmc q2, q2	@ AES block 4k+6 - round 8
	eor q5, q5, q1	@ AES block 4k+5 - result

	add r12, r12, #1	@ CTR block 4k+9
	orr r9, r11, r9, lsl #32	@ CTR block 4k+9
	fmov d1, r10	@ CTR block 4k+9

	pmull v9.1q, v10.1d, q8	@ MODULO - mid 64b align with low
	fmov v1.d[1], r9	@ CTR block 4k+9
	rev r9, r12	@ CTR block 4k+10

	aese q2, v27.16b	@ AES block 4k+6 - round 9
	st1 { q4}, [r2], #16	@ AES block 4k+4 - store result
	eor q6, q6, q2	@ AES block 4k+6 - result
	orr r9, r11, r9, lsl #32	@ CTR block 4k+10

	aese q3, v27.16b	@ AES block 4k+7 - round 9
	add r12, r12, #1	@ CTR block 4k+10
	ext v10.16b, v10.16b, v10.16b, #8	@ MODULO - other mid alignment
	fmov d2, r10	@ CTR block 4k+10

	eor v11.16b, v11.16b, q9	@ MODULO - fold into low
	st1 { q5}, [r2], #16	@ AES block 4k+5 - store result

	fmov v2.d[1], r9	@ CTR block 4k+10
	st1 { q6}, [r2], #16	@ AES block 4k+6 - store result
	rev r9, r12	@ CTR block 4k+11

	orr r9, r11, r9, lsl #32	@ CTR block 4k+11
	eor q7, q7, q3	@ AES block 4k+3 - result

	eor v11.16b, v11.16b, v10.16b	@ MODULO - fold into low
	st1 { q7}, [r2], #16	@ AES block 4k+3 - store result
	blt .L128_enc_main_loop

	@ Prepretail: entered once r0 has reached r5.  GHASH the last four
	@ stored ciphertext blocks (q4..q7) and run rounds 0..9 on counter
	@ blocks q0..q3.  No input is loaded and nothing is stored here;
	@ execution falls through into the tail.
.L128_enc_prepretail:@ PREPRETAIL
	rev64 q4, q4	@ GHASH block 4k (only t0 is free)
	fmov d3, r10	@ CTR block 4k+3
	rev64 q5, q5	@ GHASH block 4k+1 (t0 and t1 free)

	ext v11.16b, v11.16b, v11.16b, #8	@ PRE 0
	add r12, r12, #1	@ CTR block 4k+3
	fmov v3.d[1], r9	@ CTR block 4k+3

	aese q1, v18.16b
	aesmc q1, q1	@ AES block 4k+5 - round 0
	rev64 q6, q6	@ GHASH block 4k+2 (t0, t1, and t2 free)

	pmull v29.1q, q5, v14.1d	@ GHASH block 4k+1 - low

	rev64 q7, q7	@ GHASH block 4k+3 (t0, t1, t2 and t3 free)
	eor q4, q4, v11.16b	@ PRE 1

	pmull2 v28.1q, q5, v14.2d	@ GHASH block 4k+1 - high

	aese q3, v18.16b
	aesmc q3, q3	@ AES block 4k+7 - round 0
	mov d30, v5.d[1]	@ GHASH block 4k+1 - mid

	pmull v11.1q, q4, v15.1d	@ GHASH block 4k - low
	mov d8, v4.d[1]	@ GHASH block 4k - mid

	mov d31, v6.d[1]	@ GHASH block 4k+2 - mid
	mov d10, v17.d[1]	@ GHASH block 4k - mid

	aese q1, v19.16b
	aesmc q1, q1	@ AES block 4k+5 - round 1
	eor v30.8b, v30.8b, q5	@ GHASH block 4k+1 - mid

	eor q8, q8, q4	@ GHASH block 4k - mid

	pmull2 v9.1q, q4, v15.2d	@ GHASH block 4k - high
	eor v31.8b, v31.8b, q6	@ GHASH block 4k+2 - mid

	aese q3, v19.16b
	aesmc q3, q3	@ AES block 4k+7 - round 1

	pmull v30.1q, v30.1d, v17.1d	@ GHASH block 4k+1 - mid
	eor v11.16b, v11.16b, v29.16b	@ GHASH block 4k+1 - low

	pmull v10.1q, q8, v10.1d	@ GHASH block 4k - mid

	aese q0, v18.16b
	aesmc q0, q0	@ AES block 4k+4 - round 0
	ins v31.d[1], v31.d[0]	@ GHASH block 4k+2 - mid

	aese q2, v18.16b
	aesmc q2, q2	@ AES block 4k+6 - round 0

	eor v10.16b, v10.16b, v30.16b	@ GHASH block 4k+1 - mid
	mov d30, v7.d[1]	@ GHASH block 4k+3 - mid

	aese q0, v19.16b
	aesmc q0, q0	@ AES block 4k+4 - round 1
	eor q9, q9, v28.16b	@ GHASH block 4k+1 - high

	pmull2 v31.1q, v31.2d, v16.2d	@ GHASH block 4k+2 - mid

	pmull2 v8.1q, q6, v13.2d	@ GHASH block 4k+2 - high
	eor v30.8b, v30.8b, q7	@ GHASH block 4k+3 - mid

	pmull2 v4.1q, q7, v12.2d	@ GHASH block 4k+3 - high

	pmull v28.1q, q6, v13.1d	@ GHASH block 4k+2 - low

	aese q2, v19.16b
	aesmc q2, q2	@ AES block 4k+6 - round 1
	eor q9, q9, q8	@ GHASH block 4k+2 - high

	aese q0, v20.16b
	aesmc q0, q0	@ AES block 4k+4 - round 2

	pmull v29.1q, q7, v12.1d	@ GHASH block 4k+3 - low
	movi q8, #0xc2

	aese q2, v20.16b
	aesmc q2, q2	@ AES block 4k+6 - round 2
	eor v11.16b, v11.16b, v28.16b	@ GHASH block 4k+2 - low

	aese q3, v20.16b
	aesmc q3, q3	@ AES block 4k+7 - round 2

	pmull v30.1q, v30.1d, v16.1d	@ GHASH block 4k+3 - mid
	eor v10.16b, v10.16b, v31.16b	@ GHASH block 4k+2 - mid

	aese q2, v21.16b
	aesmc q2, q2	@ AES block 4k+6 - round 3

	aese q1, v20.16b
	aesmc q1, q1	@ AES block 4k+5 - round 2
	eor q9, q9, q4	@ GHASH block 4k+3 - high

	aese q0, v21.16b
	aesmc q0, q0	@ AES block 4k+4 - round 3

	eor v10.16b, v10.16b, v30.16b	@ GHASH block 4k+3 - mid
	shl d8, d8, #56	@ mod_constant

	aese q1, v21.16b
	aesmc q1, q1	@ AES block 4k+5 - round 3
	eor v11.16b, v11.16b, v29.16b	@ GHASH block 4k+3 - low

	aese q0, v22.16b
	aesmc q0, q0	@ AES block 4k+4 - round 4

	pmull v28.1q, q9, q8
	eor v10.16b, v10.16b, q9	@ karatsuba tidy up

	aese q1, v22.16b
	aesmc q1, q1	@ AES block 4k+5 - round 4

	aese q0, v23.16b
	aesmc q0, q0	@ AES block 4k+4 - round 5
	ext q9, q9, q9, #8

	aese q3, v21.16b
	aesmc q3, q3	@ AES block 4k+7 - round 3

	aese q2, v22.16b
	aesmc q2, q2	@ AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v11.16b

	aese q0, v24.16b
	aesmc q0, q0	@ AES block 4k+4 - round 6

	aese q3, v22.16b
	aesmc q3, q3	@ AES block 4k+7 - round 4

	aese q1, v23.16b
	aesmc q1, q1	@ AES block 4k+5 - round 5

	aese q2, v23.16b
	aesmc q2, q2	@ AES block 4k+6 - round 5
	eor v10.16b, v10.16b, v28.16b

	aese q3, v23.16b
	aesmc q3, q3	@ AES block 4k+7 - round 5

	aese q1, v24.16b
	aesmc q1, q1	@ AES block 4k+5 - round 6

	aese q2, v24.16b
	aesmc q2, q2	@ AES block 4k+6 - round 6

	aese q3, v24.16b
	aesmc q3, q3	@ AES block 4k+7 - round 6
	eor v10.16b, v10.16b, q9

	aese q0, v25.16b
	aesmc q0, q0	@ AES block 4k+4 - round 7

	aese q2, v25.16b
	aesmc q2, q2	@ AES block 4k+6 - round 7

	aese q3, v25.16b
	aesmc q3, q3	@ AES block 4k+7 - round 7

	pmull v28.1q, v10.1d, q8

	aese q1, v25.16b
	aesmc q1, q1	@ AES block 4k+5 - round 7
	ext v10.16b, v10.16b, v10.16b, #8

	aese q3, v26.16b
	aesmc q3, q3	@ AES block 4k+7 - round 8

	aese q0, v26.16b
	aesmc q0, q0	@ AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v28.16b

	aese q1, v26.16b
	aesmc q1, q1	@ AES block 4k+5 - round 8

	aese q3, v27.16b	@ AES block 4k+7 - round 9

	aese q2, v26.16b
	aesmc q2, q2	@ AES block 4k+6 - round 8

	aese q0, v27.16b	@ AES block 4k+4 - round 9

	aese q1, v27.16b	@ AES block 4k+5 - round 9
	eor v11.16b, v11.16b, v10.16b

	aese q2, v27.16b	@ AES block 4k+6 - round 9
	@ Tail: r5 = r4 - r0 is the number of bytes still to process.  The
	@ first remaining block is always encrypted with q0.  The compares
	@ against 48/32/16 pick the entry point into the chain of handlers
	@ below.  Shorter tails undo the unused counter increments (sub r12)
	@ and move keystream blocks up into q2/q3 for the handlers that
	@ follow.  They also zero v9/v10/v11, because only the more-than-3
	@ path initialises those accumulators.
.L128_enc_tail:@ TAIL

	sub r5, r4, r0	@ main_end_input_ptr is number of bytes left to process
	ldp r6, r7, [r0], #16	@ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	cmp r5, #48

	ext q8, v11.16b, v11.16b, #8	@ prepare final partial tag
	eor r6, r6, r13	@ AES block 4k+4 - round 10 low
	eor r7, r7, r14	@ AES block 4k+4 - round 10 high

	fmov d4, r6	@ AES block 4k+4 - mov low

	fmov v4.d[1], r7	@ AES block 4k+4 - mov high

	eor q5, q4, q0	@ AES block 4k+4 - result

	bgt .L128_enc_blocks_more_than_3

	sub r12, r12, #1
	movi v11.8b, #0
	mov q3, q2

	cmp r5, #32
	mov q2, q1
	movi q9, #0

	movi v10.8b, #0
	bgt .L128_enc_blocks_more_than_2

	mov q3, q1
	cmp r5, #16

	sub r12, r12, #1
	bgt .L128_enc_blocks_more_than_1

	sub r12, r12, #1
	b .L128_enc_blocks_less_than_1
	@ Each handler below stores the block just encrypted, hashes it,
	@ and encrypts the next input block with the next keystream register.
.L128_enc_blocks_more_than_3:@ blocks left > 3
	st1 { q5}, [r2], #16	@ AES final-3 block - store result

	ldp r6, r7, [r0], #16	@ AES final-2 block - load input low & high
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	rev64 q4, q5	@ GHASH final-3 block

	eor q4, q4, q8	@ feed in partial tag
	eor r7, r7, r14	@ AES final-2 block - round 10 high
	eor r6, r6, r13	@ AES final-2 block - round 10 low

	fmov d5, r6	@ AES final-2 block - mov low

	movi q8, #0	@ suppress further partial tag feed in
	fmov v5.d[1], r7	@ AES final-2 block - mov high

	pmull v11.1q, q4, v15.1d	@ GHASH final-3 block - low
	mov d22, v4.d[1]	@ GHASH final-3 block - mid

	pmull2 v9.1q, q4, v15.2d	@ GHASH final-3 block - high

	mov d10, v17.d[1]	@ GHASH final-3 block - mid

	eor q5, q5, q1	@ AES final-2 block - result
	eor v22.8b, v22.8b, q4	@ GHASH final-3 block - mid

	pmull v10.1q, v22.1d, v10.1d	@ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:@ blocks left > 2

	st1 { q5}, [r2], #16	@ AES final-2 block - store result

	rev64 q4, q5	@ GHASH final-2 block
	ldp r6, r7, [r0], #16	@ AES final-1 block - load input low & high
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	eor q4, q4, q8	@ feed in partial tag

	eor r6, r6, r13	@ AES final-1 block - round 10 low

	fmov d5, r6	@ AES final-1 block - mov low
	eor r7, r7, r14	@ AES final-1 block - round 10 high

	pmull2 v20.1q, q4, v14.2d	@ GHASH final-2 block - high
	fmov v5.d[1], r7	@ AES final-1 block - mov high

	mov d22, v4.d[1]	@ GHASH final-2 block - mid

	pmull v21.1q, q4, v14.1d	@ GHASH final-2 block - low

	eor q9, q9, v20.16b	@ GHASH final-2 block - high

	eor v22.8b, v22.8b, q4	@ GHASH final-2 block - mid

	eor q5, q5, q2	@ AES final-1 block - result

	eor v11.16b, v11.16b, v21.16b	@ GHASH final-2 block - low

	pmull v22.1q, v22.1d, v17.1d	@ GHASH final-2 block - mid

	movi q8, #0	@ suppress further partial tag feed in

	eor v10.16b, v10.16b, v22.16b	@ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:@ blocks left > 1

	st1 { q5}, [r2], #16	@ AES final-1 block - store result

	rev64 q4, q5	@ GHASH final-1 block
	ldp r6, r7, [r0], #16	@ AES final block - load input low & high
#ifdef __ARMEB__
	rev r6, r6
	rev r7, r7
#endif
	eor q4, q4, q8	@ feed in partial tag

	eor r7, r7, r14	@ AES final block - round 10 high
	eor r6, r6, r13	@ AES final block - round 10 low

	fmov d5, r6	@ AES final block - mov low

	pmull2 v20.1q, q4, v13.2d	@ GHASH final-1 block - high
	fmov v5.d[1], r7	@ AES final block - mov high

	mov d22, v4.d[1]	@ GHASH final-1 block - mid

	pmull v21.1q, q4, v13.1d	@ GHASH final-1 block - low

	eor v22.8b, v22.8b, q4	@ GHASH final-1 block - mid

	eor q5, q5, q3	@ AES final block - result

	ins v22.d[1], v22.d[0]	@ GHASH final-1 block - mid

	pmull2 v22.1q, v22.2d, v16.2d	@ GHASH final-1 block - mid

	eor v11.16b, v11.16b, v21.16b	@ GHASH final-1 block - low

	eor q9, q9, v20.16b	@ GHASH final-1 block - high

	eor v10.16b, v10.16b, v22.16b	@ GHASH final-1 block - mid
	movi q8, #0	@ suppress further partial tag feed in
	@ Final (possibly partial) block.  r13/r14 are reused to build a
	@ mask from the bit length; the mask is moved into q0 and ANDed with
	@ the encrypted block so unused bits are zero before hashing.  The
	@ block is then merged (bif) with the bytes already present at the
	@ output location so that a full 16-byte store does not change the
	@ bytes outside the mask.  The GHASH reduction finishes alongside.
.L128_enc_blocks_less_than_1:@ blocks left <= 1

	and r1, r1, #127	@ bit_length %= 128
	mvn r13, xzr	@ rk10_l = 0xffffffffffffffff

	mvn r14, xzr	@ rk10_h = 0xffffffffffffffff
	sub r1, r1, #128	@ bit_length -= 128

	neg r1, r1	@ bit_length = 128 - #bits in input (in range [1,128])

	and r1, r1, #127	@ bit_length %= 128

	lsr r14, r14, r1	@ rk10_h is mask for top 64b of last block
	cmp r1, #64

	csel r6, r13, r14, lt
	csel r7, r14, xzr, lt

	fmov d0, r6	@ ctr0b is mask for last block

	fmov v0.d[1], r7

	and q5, q5, q0	@ possibly partial last block has zeroes in highest bits

	rev64 q4, q5	@ GHASH final block

	eor q4, q4, q8	@ feed in partial tag

	mov d8, v4.d[1]	@ GHASH final block - mid

	pmull v21.1q, q4, v12.1d	@ GHASH final block - low
	ld1 { v18.16b}, [r2]	@ load existing bytes where the possibly partial last block is to be stored

	eor q8, q8, q4	@ GHASH final block - mid
#ifndef __ARMEB__
	rev r9, r12
#else
	mov r9, r12
#endif
	pmull2 v20.1q, q4, v12.2d	@ GHASH final block - high

	pmull v8.1q, q8, v16.1d	@ GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b	@ GHASH final block - low

	eor q9, q9, v20.16b	@ GHASH final block - high

	eor v10.16b, v10.16b, q8	@ GHASH final block - mid
	movi q8, #0xc2

	eor v30.16b, v11.16b, q9	@ MODULO - karatsuba tidy up

	shl d8, d8, #56	@ mod_constant

	eor v10.16b, v10.16b, v30.16b	@ MODULO - karatsuba tidy up

	pmull v31.1q, q9, q8	@ MODULO - top 64b align with mid

	ext q9, q9, q9, #8	@ MODULO - other top alignment

	eor v10.16b, v10.16b, v31.16b	@ MODULO - fold into mid

	eor v10.16b, v10.16b, q9	@ MODULO - fold into mid

	pmull v9.1q, v10.1d, q8	@ MODULO - mid 64b align with low

	ext v10.16b, v10.16b, v10.16b, #8	@ MODULO - other mid alignment

	bif q5, v18.16b, q0	@ insert existing bytes in top end of result before storing

	eor v11.16b, v11.16b, q9	@ MODULO - fold into low
	st1 { q5}, [r2]	@ store all 16B

	str r9, [r16, #12]	@ store the updated counter

	@ Epilogue: undo the entry-time ext/rev64 on the tag and write it
	@ back to [r3], return byte_len in r0, restore the saved registers
	@ and release the frame.
	eor v11.16b, v11.16b, v10.16b	@ MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov r0, r15
	st1 { v11.16b }, [r3]
	ldp r21, r22, [sp, #16]
	ldp r23, r24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp r19, r20, [sp], #112
	RET

	@ Zero-length input: no frame was opened, just return 0.
.L128_enc_ret:
	mov r0, #0x0
	RET
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz r1, .L128_dec_ret
	stp r19, r20, [sp, #-112]!
	mov r16, r4
	mov r8, r5
	stp r21, r22, [sp, #16]
	stp r23, r24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	lsr r5, r1, #3	@ byte_len
	mov r15, r5
	ldp r10, r11, [r16]	@ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
	rev r10, r10
	rev r11, r11
#endif
	ldp r13, r14, [r8, #160]	@ load rk10
#ifdef __ARMEB__
	ror r14, r14, 32
	ror r13, r13, 32
#endif
	sub r5, r5, #1	@ byte_len - 1
	ld1 {v18.4s}, [r8], #16	@ load rk0

	and r5, r5, #0xffffffffffffffc0	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ld1 { q0}, [r16]	@ special case vector load initial counter so we can start first AES block as quickly as possible

	ldr q13, [r3, #64]	@ load h2l | h2h
#ifndef __ARMEB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif
	lsr r12, r11, #32
	fmov d2, r10	@ CTR block 2

	ld1 {v19.4s}, [r8], #16	@ load rk1
	orr r11, r11, r11
	rev r12, r12	@ rev_ctr32

	fmov d1, r10	@ CTR block 1
	add r12, r12, #1	@ increment rev_ctr32

	aese q0, v18.16b
	aesmc q0, q0	@ AES block 0 - round 0
	rev r9, r12	@ CTR block 1

	orr r9, r11, r9, lsl #32	@ CTR block 1
	ld1 {v20.4s}, [r8], #16	@ load rk2
	add r12, r12, #1	@ CTR block 1

	fmov v1.d[1], r9	@ CTR block 1
	rev r9, r12	@ CTR block 2
1057 add r12, r12, #1 @ CTR block 2 1058 1059 aese q0, v19.16b 1060 aesmc q0, q0 @ AES block 0 - round 1 1061 orr r9, r11, r9, lsl #32 @ CTR block 2 1062 1063 fmov v2.d[1], r9 @ CTR block 2 1064 rev r9, r12 @ CTR block 3 1065 1066 fmov d3, r10 @ CTR block 3 1067 orr r9, r11, r9, lsl #32 @ CTR block 3 1068 add r12, r12, #1 @ CTR block 3 1069 1070 fmov v3.d[1], r9 @ CTR block 3 1071 add r4, r0, r1, lsr #3 @ end_input_ptr 1072 1073 aese q1, v18.16b 1074 aesmc q1, q1 @ AES block 1 - round 0 1075 ld1 {v21.4s}, [r8], #16 @ load rk3 1076 1077 aese q0, v20.16b 1078 aesmc q0, q0 @ AES block 0 - round 2 1079 ld1 {v22.4s}, [r8], #16 @ load rk4 1080 1081 aese q2, v18.16b 1082 aesmc q2, q2 @ AES block 2 - round 0 1083 ld1 {v23.4s}, [r8], #16 @ load rk5 1084 1085 aese q1, v19.16b 1086 aesmc q1, q1 @ AES block 1 - round 1 1087 ld1 {v24.4s}, [r8], #16 @ load rk6 1088 1089 aese q3, v18.16b 1090 aesmc q3, q3 @ AES block 3 - round 0 1091 1092 aese q2, v19.16b 1093 aesmc q2, q2 @ AES block 2 - round 1 1094 1095 aese q1, v20.16b 1096 aesmc q1, q1 @ AES block 1 - round 2 1097 1098 aese q3, v19.16b 1099 aesmc q3, q3 @ AES block 3 - round 1 1100 ld1 { v11.16b}, [r3] 1101 ext v11.16b, v11.16b, v11.16b, #8 1102 rev64 v11.16b, v11.16b 1103 1104 aese q0, v21.16b 1105 aesmc q0, q0 @ AES block 0 - round 3 1106 ld1 {v25.4s}, [r8], #16 @ load rk7 1107 1108 aese q1, v21.16b 1109 aesmc q1, q1 @ AES block 1 - round 3 1110 1111 aese q3, v20.16b 1112 aesmc q3, q3 @ AES block 3 - round 2 1113 1114 aese q2, v20.16b 1115 aesmc q2, q2 @ AES block 2 - round 2 1116 ld1 {v26.4s}, [r8], #16 @ load rk8 1117 1118 aese q1, v22.16b 1119 aesmc q1, q1 @ AES block 1 - round 4 1120 1121 aese q3, v21.16b 1122 aesmc q3, q3 @ AES block 3 - round 3 1123 1124 aese q2, v21.16b 1125 aesmc q2, q2 @ AES block 2 - round 3 1126 ldr q14, [r3, #80] @ load h3l | h3h 1127 #ifndef __ARMEB__ 1128 ext v14.16b, v14.16b, v14.16b, #8 1129 #endif 1130 aese q0, v22.16b 1131 aesmc q0, q0 @ AES block 0 - round 4 1132 ld1 {v27.4s}, [r8], #16 
@ load rk9 1133 1134 aese q1, v23.16b 1135 aesmc q1, q1 @ AES block 1 - round 5 1136 1137 aese q2, v22.16b 1138 aesmc q2, q2 @ AES block 2 - round 4 1139 1140 aese q3, v22.16b 1141 aesmc q3, q3 @ AES block 3 - round 4 1142 1143 aese q0, v23.16b 1144 aesmc q0, q0 @ AES block 0 - round 5 1145 1146 aese q2, v23.16b 1147 aesmc q2, q2 @ AES block 2 - round 5 1148 ldr q12, [r3, #32] @ load h1l | h1h 1149 #ifndef __ARMEB__ 1150 ext v12.16b, v12.16b, v12.16b, #8 1151 #endif 1152 aese q3, v23.16b 1153 aesmc q3, q3 @ AES block 3 - round 5 1154 1155 aese q0, v24.16b 1156 aesmc q0, q0 @ AES block 0 - round 6 1157 1158 aese q1, v24.16b 1159 aesmc q1, q1 @ AES block 1 - round 6 1160 1161 aese q3, v24.16b 1162 aesmc q3, q3 @ AES block 3 - round 6 1163 1164 aese q2, v24.16b 1165 aesmc q2, q2 @ AES block 2 - round 6 1166 trn1 q8, v12.2d, v13.2d @ h2h | h1h 1167 1168 ldr q15, [r3, #112] @ load h4l | h4h 1169 #ifndef __ARMEB__ 1170 ext v15.16b, v15.16b, v15.16b, #8 1171 #endif 1172 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 1173 add r5, r5, r0 1174 1175 aese q1, v25.16b 1176 aesmc q1, q1 @ AES block 1 - round 7 1177 1178 aese q2, v25.16b 1179 aesmc q2, q2 @ AES block 2 - round 7 1180 1181 aese q0, v25.16b 1182 aesmc q0, q0 @ AES block 0 - round 7 1183 eor v16.16b, v16.16b, q8 @ h2k | h1k 1184 1185 aese q3, v25.16b 1186 aesmc q3, q3 @ AES block 3 - round 7 1187 1188 aese q1, v26.16b 1189 aesmc q1, q1 @ AES block 1 - round 8 1190 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 1191 1192 aese q2, v26.16b 1193 aesmc q2, q2 @ AES block 2 - round 8 1194 1195 aese q3, v26.16b 1196 aesmc q3, q3 @ AES block 3 - round 8 1197 1198 aese q0, v26.16b 1199 aesmc q0, q0 @ AES block 0 - round 8 1200 trn1 q9, v14.2d, v15.2d @ h4h | h3h 1201 1202 aese q2, v27.16b @ AES block 2 - round 9 1203 1204 aese q3, v27.16b @ AES block 3 - round 9 1205 1206 aese q0, v27.16b @ AES block 0 - round 9 1207 cmp r0, r5 @ check if we have <= 4 blocks 1208 1209 aese q1, v27.16b @ AES block 1 - round 9 1210 eor v17.16b, v17.16b, q9 
@ h4k | h3k 1211 bge .L128_dec_tail @ handle tail 1212 1213 ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext 1214 1215 eor q1, q5, q1 @ AES block 1 - result 1216 ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext 1217 1218 eor q0, q4, q0 @ AES block 0 - result 1219 rev64 q4, q4 @ GHASH block 0 1220 rev r9, r12 @ CTR block 4 1221 1222 orr r9, r11, r9, lsl #32 @ CTR block 4 1223 add r12, r12, #1 @ CTR block 4 1224 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext 1225 1226 rev64 q5, q5 @ GHASH block 1 1227 mov r19, v1.d[0] @ AES block 1 - mov low 1228 1229 mov r20, v1.d[1] @ AES block 1 - mov high 1230 1231 mov r6, v0.d[0] @ AES block 0 - mov low 1232 cmp r0, r5 @ check if we have <= 8 blocks 1233 1234 mov r7, v0.d[1] @ AES block 0 - mov high 1235 1236 fmov d0, r10 @ CTR block 4 1237 1238 fmov v0.d[1], r9 @ CTR block 4 1239 rev r9, r12 @ CTR block 5 1240 eor r19, r19, r13 @ AES block 1 - round 10 low 1241 #ifdef __ARMEB__ 1242 rev r19, r19 1243 #endif 1244 fmov d1, r10 @ CTR block 5 1245 add r12, r12, #1 @ CTR block 5 1246 orr r9, r11, r9, lsl #32 @ CTR block 5 1247 1248 fmov v1.d[1], r9 @ CTR block 5 1249 rev r9, r12 @ CTR block 6 1250 add r12, r12, #1 @ CTR block 6 1251 1252 orr r9, r11, r9, lsl #32 @ CTR block 6 1253 1254 eor r20, r20, r14 @ AES block 1 - round 10 high 1255 #ifdef __ARMEB__ 1256 rev r20, r20 1257 #endif 1258 eor r6, r6, r13 @ AES block 0 - round 10 low 1259 #ifdef __ARMEB__ 1260 rev r6, r6 1261 #endif 1262 eor q2, q6, q2 @ AES block 2 - result 1263 1264 eor r7, r7, r14 @ AES block 0 - round 10 high 1265 #ifdef __ARMEB__ 1266 rev r7, r7 1267 #endif 1268 stp r6, r7, [r2], #16 @ AES block 0 - store result 1269 1270 stp r19, r20, [r2], #16 @ AES block 1 - store result 1271 bge .L128_dec_prepretail @ do prepretail 1272 1273 .L128_dec_main_loop:@ main loop start 1274 eor q3, q7, q3 @ AES block 4k+3 - result 1275 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 1276 mov r21, v2.d[0] @ AES block 4k+2 - mov low 1277 1278 
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high 1279 mov r22, v2.d[1] @ AES block 4k+2 - mov high 1280 1281 aese q1, v18.16b 1282 aesmc q1, q1 @ AES block 4k+5 - round 0 1283 fmov d2, r10 @ CTR block 4k+6 1284 1285 rev64 q6, q6 @ GHASH block 4k+2 1286 fmov v2.d[1], r9 @ CTR block 4k+6 1287 rev r9, r12 @ CTR block 4k+7 1288 1289 mov r23, v3.d[0] @ AES block 4k+3 - mov low 1290 eor q4, q4, v11.16b @ PRE 1 1291 mov d30, v5.d[1] @ GHASH block 4k+1 - mid 1292 1293 aese q1, v19.16b 1294 aesmc q1, q1 @ AES block 4k+5 - round 1 1295 rev64 q7, q7 @ GHASH block 4k+3 1296 1297 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low 1298 mov r24, v3.d[1] @ AES block 4k+3 - mov high 1299 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 1300 1301 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 1302 fmov d3, r10 @ CTR block 4k+7 1303 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid 1304 1305 aese q1, v20.16b 1306 aesmc q1, q1 @ AES block 4k+5 - round 2 1307 fmov v3.d[1], r9 @ CTR block 4k+7 1308 1309 aese q2, v18.16b 1310 aesmc q2, q2 @ AES block 4k+6 - round 0 1311 mov d10, v17.d[1] @ GHASH block 4k - mid 1312 1313 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 1314 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low 1315 1316 pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low 1317 1318 aese q1, v21.16b 1319 aesmc q1, q1 @ AES block 4k+5 - round 3 1320 mov d8, v4.d[1] @ GHASH block 4k - mid 1321 1322 aese q3, v18.16b 1323 aesmc q3, q3 @ AES block 4k+7 - round 0 1324 eor q9, q9, v28.16b @ GHASH block 4k+1 - high 1325 1326 aese q0, v18.16b 1327 aesmc q0, q0 @ AES block 4k+4 - round 0 1328 1329 pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low 1330 eor q8, q8, q4 @ GHASH block 4k - mid 1331 1332 aese q3, v19.16b 1333 aesmc q3, q3 @ AES block 4k+7 - round 1 1334 eor r23, r23, r13 @ AES block 4k+3 - round 10 low 1335 #ifdef __ARMEB__ 1336 rev r23, r23 1337 #endif 1338 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid 1339 eor r22, r22, r14 @ AES block 4k+2 - round 10 high 1340 #ifdef 
__ARMEB__ 1341 rev r22, r22 1342 #endif 1343 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 1344 1345 aese q0, v19.16b 1346 aesmc q0, q0 @ AES block 4k+4 - round 1 1347 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low 1348 1349 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 1350 1351 aese q3, v20.16b 1352 aesmc q3, q3 @ AES block 4k+7 - round 2 1353 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 1354 1355 aese q0, v20.16b 1356 aesmc q0, q0 @ AES block 4k+4 - round 2 1357 1358 aese q1, v22.16b 1359 aesmc q1, q1 @ AES block 4k+5 - round 4 1360 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid 1361 1362 pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high 1363 1364 aese q0, v21.16b 1365 aesmc q0, q0 @ AES block 4k+4 - round 3 1366 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 1367 1368 pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high 1369 1370 aese q2, v19.16b 1371 aesmc q2, q2 @ AES block 4k+6 - round 1 1372 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 1373 1374 aese q0, v22.16b 1375 aesmc q0, q0 @ AES block 4k+4 - round 4 1376 eor q9, q9, q8 @ GHASH block 4k+2 - high 1377 1378 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 1379 eor r24, r24, r14 @ AES block 4k+3 - round 10 high 1380 #ifdef __ARMEB__ 1381 rev r24, r24 1382 #endif 1383 aese q2, v20.16b 1384 aesmc q2, q2 @ AES block 4k+6 - round 2 1385 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 1386 1387 aese q1, v23.16b 1388 aesmc q1, q1 @ AES block 4k+5 - round 5 1389 eor r21, r21, r13 @ AES block 4k+2 - round 10 low 1390 #ifdef __ARMEB__ 1391 rev r21, r21 1392 #endif 1393 aese q0, v23.16b 1394 aesmc q0, q0 @ AES block 4k+4 - round 5 1395 movi q8, #0xc2 1396 1397 aese q2, v21.16b 1398 aesmc q2, q2 @ AES block 4k+6 - round 3 1399 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low 1400 1401 aese q1, v24.16b 1402 aesmc q1, q1 @ AES block 4k+5 - round 6 1403 1404 aese q0, v24.16b 1405 aesmc q0, q0 @ AES block 4k+4 - round 6 1406 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 1407 1408 aese 
q2, v22.16b 1409 aesmc q2, q2 @ AES block 4k+6 - round 4 1410 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 1411 1412 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 1413 eor q9, q9, q4 @ GHASH block 4k+3 - high 1414 ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext 1415 1416 aese q1, v25.16b 1417 aesmc q1, q1 @ AES block 4k+5 - round 7 1418 add r12, r12, #1 @ CTR block 4k+7 1419 1420 aese q0, v25.16b 1421 aesmc q0, q0 @ AES block 4k+4 - round 7 1422 shl d8, d8, #56 @ mod_constant 1423 1424 aese q2, v23.16b 1425 aesmc q2, q2 @ AES block 4k+6 - round 5 1426 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 1427 1428 aese q1, v26.16b 1429 aesmc q1, q1 @ AES block 4k+5 - round 8 1430 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 1431 1432 aese q0, v26.16b 1433 aesmc q0, q0 @ AES block 4k+4 - round 8 1434 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1435 1436 aese q3, v21.16b 1437 aesmc q3, q3 @ AES block 4k+7 - round 3 1438 rev r9, r12 @ CTR block 4k+8 1439 1440 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1441 ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 1442 ext q9, q9, q9, #8 @ MODULO - other top alignment 1443 1444 aese q0, v27.16b @ AES block 4k+4 - round 9 1445 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 1446 1447 aese q3, v22.16b 1448 aesmc q3, q3 @ AES block 4k+7 - round 4 1449 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1450 1451 aese q1, v27.16b @ AES block 4k+5 - round 9 1452 1453 aese q2, v24.16b 1454 aesmc q2, q2 @ AES block 4k+6 - round 6 1455 eor q0, q4, q0 @ AES block 4k+4 - result 1456 1457 aese q3, v23.16b 1458 aesmc q3, q3 @ AES block 4k+7 - round 5 1459 ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext 1460 1461 add r12, r12, #1 @ CTR block 4k+8 1462 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1463 eor q1, q5, q1 @ AES block 4k+5 - result 1464 1465 aese q2, v25.16b 1466 aesmc q2, q2 @ AES block 4k+6 - round 7 1467 ld1 {q7}, [r0], #16 @ AES block 4k+6 - 
load ciphertext 1468 1469 aese q3, v24.16b 1470 aesmc q3, q3 @ AES block 4k+7 - round 6 1471 1472 rev64 q5, q5 @ GHASH block 4k+5 1473 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 1474 mov r7, v0.d[1] @ AES block 4k+4 - mov high 1475 1476 aese q2, v26.16b 1477 aesmc q2, q2 @ AES block 4k+6 - round 8 1478 mov r6, v0.d[0] @ AES block 4k+4 - mov low 1479 1480 aese q3, v25.16b 1481 aesmc q3, q3 @ AES block 4k+7 - round 7 1482 fmov d0, r10 @ CTR block 4k+8 1483 1484 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1485 fmov v0.d[1], r9 @ CTR block 4k+8 1486 rev r9, r12 @ CTR block 4k+9 1487 1488 aese q2, v27.16b @ AES block 4k+6 - round 9 1489 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 1490 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1491 1492 aese q3, v26.16b 1493 aesmc q3, q3 @ AES block 4k+7 - round 8 1494 eor r7, r7, r14 @ AES block 4k+4 - round 10 high 1495 #ifdef __ARMEB__ 1496 rev r7, r7 1497 #endif 1498 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1499 mov r20, v1.d[1] @ AES block 4k+5 - mov high 1500 eor r6, r6, r13 @ AES block 4k+4 - round 10 low 1501 #ifdef __ARMEB__ 1502 rev r6, r6 1503 #endif 1504 eor q2, q6, q2 @ AES block 4k+6 - result 1505 mov r19, v1.d[0] @ AES block 4k+5 - mov low 1506 add r12, r12, #1 @ CTR block 4k+9 1507 1508 aese q3, v27.16b @ AES block 4k+7 - round 9 1509 fmov d1, r10 @ CTR block 4k+9 1510 cmp r0, r5 @ .LOOP CONTROL 1511 1512 rev64 q4, q4 @ GHASH block 4k+4 1513 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1514 fmov v1.d[1], r9 @ CTR block 4k+9 1515 1516 rev r9, r12 @ CTR block 4k+10 1517 add r12, r12, #1 @ CTR block 4k+10 1518 1519 eor r20, r20, r14 @ AES block 4k+5 - round 10 high 1520 #ifdef __ARMEB__ 1521 rev r20, r20 1522 #endif 1523 stp r6, r7, [r2], #16 @ AES block 4k+4 - store result 1524 1525 eor r19, r19, r13 @ AES block 4k+5 - round 10 low 1526 #ifdef __ARMEB__ 1527 rev r19, r19 1528 #endif 1529 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 1530 1531 orr r9, 
r11, r9, lsl #32 @ CTR block 4k+10 1532 blt .L128_dec_main_loop 1533 1534 .L128_dec_prepretail:@ PREPRETAIL 1535 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 1536 mov r21, v2.d[0] @ AES block 4k+2 - mov low 1537 mov d30, v5.d[1] @ GHASH block 4k+1 - mid 1538 1539 aese q0, v18.16b 1540 aesmc q0, q0 @ AES block 4k+4 - round 0 1541 eor q3, q7, q3 @ AES block 4k+3 - result 1542 1543 aese q1, v18.16b 1544 aesmc q1, q1 @ AES block 4k+5 - round 0 1545 mov r22, v2.d[1] @ AES block 4k+2 - mov high 1546 1547 eor q4, q4, v11.16b @ PRE 1 1548 fmov d2, r10 @ CTR block 4k+6 1549 rev64 q6, q6 @ GHASH block 4k+2 1550 1551 aese q0, v19.16b 1552 aesmc q0, q0 @ AES block 4k+4 - round 1 1553 fmov v2.d[1], r9 @ CTR block 4k+6 1554 1555 rev r9, r12 @ CTR block 4k+7 1556 mov r23, v3.d[0] @ AES block 4k+3 - mov low 1557 eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid 1558 1559 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 1560 mov d10, v17.d[1] @ GHASH block 4k - mid 1561 mov r24, v3.d[1] @ AES block 4k+3 - mov high 1562 1563 aese q1, v19.16b 1564 aesmc q1, q1 @ AES block 4k+5 - round 1 1565 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 1566 1567 aese q0, v20.16b 1568 aesmc q0, q0 @ AES block 4k+4 - round 2 1569 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 1570 1571 pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low 1572 mov d8, v4.d[1] @ GHASH block 4k - mid 1573 fmov d3, r10 @ CTR block 4k+7 1574 1575 aese q2, v18.16b 1576 aesmc q2, q2 @ AES block 4k+6 - round 0 1577 fmov v3.d[1], r9 @ CTR block 4k+7 1578 1579 pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid 1580 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 1581 1582 rev64 q7, q7 @ GHASH block 4k+3 1583 1584 aese q2, v19.16b 1585 aesmc q2, q2 @ AES block 4k+6 - round 1 1586 eor q8, q8, q4 @ GHASH block 4k - mid 1587 1588 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 1589 1590 aese q3, v18.16b 1591 aesmc q3, q3 @ AES block 4k+7 - round 0 1592 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 1593 1594 pmull2 v28.1q, q5, v14.2d @ GHASH 
block 4k+1 - high 1595 1596 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 1597 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low 1598 1599 pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low 1600 1601 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 1602 eor q9, q9, v28.16b @ GHASH block 4k+1 - high 1603 1604 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid 1605 1606 pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high 1607 1608 pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high 1609 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 1610 1611 aese q1, v20.16b 1612 aesmc q1, q1 @ AES block 4k+5 - round 2 1613 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 1614 1615 pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low 1616 1617 eor q9, q9, q8 @ GHASH block 4k+2 - high 1618 movi q8, #0xc2 1619 1620 aese q3, v19.16b 1621 aesmc q3, q3 @ AES block 4k+7 - round 1 1622 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 1623 1624 eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low 1625 1626 aese q2, v20.16b 1627 aesmc q2, q2 @ AES block 4k+6 - round 2 1628 eor q9, q9, q4 @ GHASH block 4k+3 - high 1629 1630 aese q3, v20.16b 1631 aesmc q3, q3 @ AES block 4k+7 - round 2 1632 eor r23, r23, r13 @ AES block 4k+3 - round 10 low 1633 #ifdef __ARMEB__ 1634 rev r23, r23 1635 #endif 1636 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 1637 eor r21, r21, r13 @ AES block 4k+2 - round 10 low 1638 #ifdef __ARMEB__ 1639 rev r21, r21 1640 #endif 1641 eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low 1642 1643 aese q2, v21.16b 1644 aesmc q2, q2 @ AES block 4k+6 - round 3 1645 1646 aese q1, v21.16b 1647 aesmc q1, q1 @ AES block 4k+5 - round 3 1648 shl d8, d8, #56 @ mod_constant 1649 1650 aese q0, v21.16b 1651 aesmc q0, q0 @ AES block 4k+4 - round 3 1652 1653 aese q2, v22.16b 1654 aesmc q2, q2 @ AES block 4k+6 - round 4 1655 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 1656 1657 aese q1, v22.16b 1658 aesmc q1, q1 @ AES block 4k+5 - round 4 1659 1660 aese q3, 
v21.16b 1661 aesmc q3, q3 @ AES block 4k+7 - round 3 1662 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1663 1664 aese q2, v23.16b 1665 aesmc q2, q2 @ AES block 4k+6 - round 5 1666 1667 aese q1, v23.16b 1668 aesmc q1, q1 @ AES block 4k+5 - round 5 1669 1670 aese q3, v22.16b 1671 aesmc q3, q3 @ AES block 4k+7 - round 4 1672 1673 aese q0, v22.16b 1674 aesmc q0, q0 @ AES block 4k+4 - round 4 1675 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1676 1677 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1678 1679 aese q1, v24.16b 1680 aesmc q1, q1 @ AES block 4k+5 - round 6 1681 ext q9, q9, q9, #8 @ MODULO - other top alignment 1682 1683 aese q3, v23.16b 1684 aesmc q3, q3 @ AES block 4k+7 - round 5 1685 1686 aese q0, v23.16b 1687 aesmc q0, q0 @ AES block 4k+4 - round 5 1688 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1689 1690 aese q1, v25.16b 1691 aesmc q1, q1 @ AES block 4k+5 - round 7 1692 1693 aese q2, v24.16b 1694 aesmc q2, q2 @ AES block 4k+6 - round 6 1695 1696 aese q0, v24.16b 1697 aesmc q0, q0 @ AES block 4k+4 - round 6 1698 1699 aese q1, v26.16b 1700 aesmc q1, q1 @ AES block 4k+5 - round 8 1701 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 1702 1703 aese q3, v24.16b 1704 aesmc q3, q3 @ AES block 4k+7 - round 6 1705 1706 aese q0, v25.16b 1707 aesmc q0, q0 @ AES block 4k+4 - round 7 1708 1709 aese q1, v27.16b @ AES block 4k+5 - round 9 1710 1711 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1712 eor r24, r24, r14 @ AES block 4k+3 - round 10 high 1713 #ifdef __ARMEB__ 1714 rev r24, r24 1715 #endif 1716 aese q2, v25.16b 1717 aesmc q2, q2 @ AES block 4k+6 - round 7 1718 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1719 1720 aese q3, v25.16b 1721 aesmc q3, q3 @ AES block 4k+7 - round 7 1722 1723 aese q0, v26.16b 1724 aesmc q0, q0 @ AES block 4k+4 - round 8 1725 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1726 1727 aese q2, v26.16b 1728 aesmc q2, q2 @ AES block 4k+6 - round 8 1729 1730 aese 
q3, v26.16b 1731 aesmc q3, q3 @ AES block 4k+7 - round 8 1732 eor r22, r22, r14 @ AES block 4k+2 - round 10 high 1733 #ifdef __ARMEB__ 1734 rev r22, r22 1735 #endif 1736 aese q0, v27.16b @ AES block 4k+4 - round 9 1737 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 1738 1739 aese q2, v27.16b @ AES block 4k+6 - round 9 1740 add r12, r12, #1 @ CTR block 4k+7 1741 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 1742 1743 aese q3, v27.16b @ AES block 4k+7 - round 9 1744 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1745 .L128_dec_tail:@ TAIL 1746 1747 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 1748 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 1749 1750 eor q0, q5, q0 @ AES block 4k+4 - result 1751 1752 mov r7, v0.d[1] @ AES block 4k+4 - mov high 1753 1754 mov r6, v0.d[0] @ AES block 4k+4 - mov low 1755 1756 cmp r5, #48 1757 1758 eor r7, r7, r14 @ AES block 4k+4 - round 10 high 1759 #ifdef __ARMEB__ 1760 rev r7, r7 1761 #endif 1762 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 1763 eor r6, r6, r13 @ AES block 4k+4 - round 10 low 1764 #ifdef __ARMEB__ 1765 rev r6, r6 1766 #endif 1767 bgt .L128_dec_blocks_more_than_3 1768 1769 mov q3, q2 1770 sub r12, r12, #1 1771 movi v11.8b, #0 1772 1773 movi q9, #0 1774 mov q2, q1 1775 1776 movi v10.8b, #0 1777 cmp r5, #32 1778 bgt .L128_dec_blocks_more_than_2 1779 1780 cmp r5, #16 1781 1782 mov q3, q1 1783 sub r12, r12, #1 1784 bgt .L128_dec_blocks_more_than_1 1785 1786 sub r12, r12, #1 1787 b .L128_dec_blocks_less_than_1 1788 .L128_dec_blocks_more_than_3:@ blocks left > 3 1789 rev64 q4, q5 @ GHASH final-3 block 1790 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 1791 1792 eor q4, q4, q8 @ feed in partial tag 1793 1794 mov d10, v17.d[1] @ GHASH final-3 block - mid 1795 stp r6, r7, [r2], #16 @ AES final-3 block - store result 1796 eor q0, q5, q1 @ AES final-2 block - result 1797 1798 mov d22, v4.d[1] @ GHASH final-3 block - mid 1799 mov r7, v0.d[1] 
@ AES final-2 block - mov high 1800 1801 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 1802 mov r6, v0.d[0] @ AES final-2 block - mov low 1803 1804 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 1805 1806 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 1807 1808 movi q8, #0 @ suppress further partial tag feed in 1809 eor r7, r7, r14 @ AES final-2 block - round 10 high 1810 #ifdef __ARMEB__ 1811 rev r7, r7 1812 #endif 1813 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 1814 eor r6, r6, r13 @ AES final-2 block - round 10 low 1815 #ifdef __ARMEB__ 1816 rev r6, r6 1817 #endif 1818 .L128_dec_blocks_more_than_2:@ blocks left > 2 1819 1820 rev64 q4, q5 @ GHASH final-2 block 1821 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 1822 1823 eor q4, q4, q8 @ feed in partial tag 1824 1825 eor q0, q5, q2 @ AES final-1 block - result 1826 stp r6, r7, [r2], #16 @ AES final-2 block - store result 1827 1828 mov d22, v4.d[1] @ GHASH final-2 block - mid 1829 1830 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 1831 1832 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 1833 mov r6, v0.d[0] @ AES final-1 block - mov low 1834 1835 mov r7, v0.d[1] @ AES final-1 block - mov high 1836 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 1837 1838 movi q8, #0 @ suppress further partial tag feed in 1839 1840 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 1841 1842 eor r6, r6, r13 @ AES final-1 block - round 10 low 1843 #ifdef __ARMEB__ 1844 rev r6, r6 1845 #endif 1846 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 1847 1848 eor q9, q9, v20.16b @ GHASH final-2 block - high 1849 1850 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 1851 eor r7, r7, r14 @ AES final-1 block - round 10 high 1852 #ifdef __ARMEB__ 1853 rev r7, r7 1854 #endif 1855 .L128_dec_blocks_more_than_1:@ blocks left > 1 1856 1857 rev64 q4, q5 @ GHASH final-1 block 1858 1859 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 1860 eor q4, q4, q8 @ 
feed in partial tag 1861 1862 mov d22, v4.d[1] @ GHASH final-1 block - mid 1863 1864 eor q0, q5, q3 @ AES final block - result 1865 1866 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 1867 1868 stp r6, r7, [r2], #16 @ AES final-1 block - store result 1869 mov r6, v0.d[0] @ AES final block - mov low 1870 1871 mov r7, v0.d[1] @ AES final block - mov high 1872 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 1873 1874 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 1875 1876 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 1877 1878 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 1879 movi q8, #0 @ suppress further partial tag feed in 1880 1881 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 1882 1883 eor q9, q9, v20.16b @ GHASH final-1 block - high 1884 eor r7, r7, r14 @ AES final block - round 10 high 1885 #ifdef __ARMEB__ 1886 rev r7, r7 1887 #endif 1888 eor r6, r6, r13 @ AES final block - round 10 low 1889 #ifdef __ARMEB__ 1890 rev r6, r6 1891 #endif 1892 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 1893 .L128_dec_blocks_less_than_1:@ blocks left <= 1 1894 1895 mvn r14, xzr @ rk10_h = 0xffffffffffffffff 1896 and r1, r1, #127 @ bit_length %= 128 1897 1898 mvn r13, xzr @ rk10_l = 0xffffffffffffffff 1899 sub r1, r1, #128 @ bit_length -= 128 1900 1901 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 1902 1903 and r1, r1, #127 @ bit_length %= 128 1904 1905 lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block 1906 cmp r1, #64 1907 1908 csel r10, r14, xzr, lt 1909 csel r9, r13, r14, lt 1910 1911 fmov d0, r9 @ ctr0b is mask for last block 1912 1913 mov v0.d[1], r10 1914 1915 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 1916 1917 rev64 q4, q5 @ GHASH final block 1918 1919 eor q4, q4, q8 @ feed in partial tag 1920 1921 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 1922 1923 and r7, r7, r10 1924 1925 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 1926 
mov d8, v4.d[1] @ GHASH final block - mid 1927 1928 eor q8, q8, q4 @ GHASH final block - mid 1929 eor q9, q9, v20.16b @ GHASH final block - high 1930 1931 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 1932 1933 pmull v21.1q, q4, v12.1d @ GHASH final block - low 1934 bic r4, r4, r9 @ mask out low existing bytes 1935 and r6, r6, r9 1936 1937 #ifndef __ARMEB__ 1938 rev r9, r12 1939 #else 1940 mov r9, r12 1941 #endif 1942 1943 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 1944 movi q8, #0xc2 1945 1946 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 1947 1948 bic r5, r5, r10 @ mask out high existing bytes 1949 shl d8, d8, #56 @ mod_constant 1950 1951 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 1952 1953 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 1954 1955 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 1956 1957 orr r6, r6, r4 1958 str r9, [r16, #12] @ store the updated counter 1959 1960 orr r7, r7, r5 1961 stp r6, r7, [r2] 1962 ext q9, q9, q9, #8 @ MODULO - other top alignment 1963 1964 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 1965 1966 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 1967 1968 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 1969 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 1970 1971 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 1972 1973 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 1974 ext v11.16b, v11.16b, v11.16b, #8 1975 rev64 v11.16b, v11.16b 1976 mov r0, r15 1977 st1 { v11.16b }, [r3] 1978 1979 ldp r21, r22, [sp, #16] 1980 ldp r23, r24, [sp, #32] 1981 ldp d8, d9, [sp, #48] 1982 ldp d10, d11, [sp, #64] 1983 ldp d12, d13, [sp, #80] 1984 ldp d14, d15, [sp, #96] 1985 ldp r19, r20, [sp], #112 1986 RET 1987 1988 .L128_dec_ret: 1989 mov r0, #0x0 1990 RET 1991 .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 1992 .globl aes_gcm_enc_192_kernel 1993 .type aes_gcm_enc_192_kernel,%function 1994 .align 4 1995 aes_gcm_enc_192_kernel: 1996 
AARCH64_VALID_CALL_TARGET 1997 cbz r1, .L192_enc_ret 1998 stp r19, r20, [sp, #-112]! 1999 mov r16, r4 2000 mov r8, r5 2001 stp r21, r22, [sp, #16] 2002 stp r23, r24, [sp, #32] 2003 stp d8, d9, [sp, #48] 2004 stp d10, d11, [sp, #64] 2005 stp d12, d13, [sp, #80] 2006 stp d14, d15, [sp, #96] 2007 2008 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 2009 #ifdef __ARMEB__ 2010 rev r10, r10 2011 rev r11, r11 2012 #endif 2013 ldp r13, r14, [r8, #192] @ load rk12 2014 #ifdef __ARMEB__ 2015 ror r13, r13, #32 2016 ror r14, r14, #32 2017 #endif 2018 ld1 {v18.4s}, [r8], #16 @ load rk0 2019 2020 ld1 {v19.4s}, [r8], #16 @ load rk1 2021 2022 ld1 {v20.4s}, [r8], #16 @ load rk2 2023 2024 lsr r12, r11, #32 2025 ld1 {v21.4s}, [r8], #16 @ load rk3 2026 orr r11, r11, r11 2027 2028 ld1 {v22.4s}, [r8], #16 @ load rk4 2029 rev r12, r12 @ rev_ctr32 2030 2031 add r12, r12, #1 @ increment rev_ctr32 2032 fmov d3, r10 @ CTR block 3 2033 2034 rev r9, r12 @ CTR block 1 2035 add r12, r12, #1 @ CTR block 1 2036 fmov d1, r10 @ CTR block 1 2037 2038 orr r9, r11, r9, lsl #32 @ CTR block 1 2039 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 2040 2041 fmov v1.d[1], r9 @ CTR block 1 2042 rev r9, r12 @ CTR block 2 2043 add r12, r12, #1 @ CTR block 2 2044 2045 fmov d2, r10 @ CTR block 2 2046 orr r9, r11, r9, lsl #32 @ CTR block 2 2047 2048 fmov v2.d[1], r9 @ CTR block 2 2049 rev r9, r12 @ CTR block 3 2050 2051 orr r9, r11, r9, lsl #32 @ CTR block 3 2052 ld1 {v23.4s}, [r8], #16 @ load rk5 2053 2054 fmov v3.d[1], r9 @ CTR block 3 2055 2056 ld1 {v24.4s}, [r8], #16 @ load rk6 2057 2058 ld1 {v25.4s}, [r8], #16 @ load rk7 2059 2060 aese q0, v18.16b 2061 aesmc q0, q0 @ AES block 0 - round 0 2062 ld1 { v11.16b}, [r3] 2063 ext v11.16b, v11.16b, v11.16b, #8 2064 rev64 v11.16b, v11.16b 2065 2066 aese q3, v18.16b 2067 aesmc q3, q3 @ AES block 3 - round 0 2068 ld1 {v26.4s}, [r8], #16 @ load rk8 2069 2070 aese q1, v18.16b 2071 aesmc q1, q1 @ AES block 1 - 
round 0 2072 ldr q15, [r3, #112] @ load h4l | h4h 2073 #ifndef __ARMEB__ 2074 ext v15.16b, v15.16b, v15.16b, #8 2075 #endif 2076 aese q2, v18.16b 2077 aesmc q2, q2 @ AES block 2 - round 0 2078 ld1 {v27.4s}, [r8], #16 @ load rk9 2079 2080 aese q0, v19.16b 2081 aesmc q0, q0 @ AES block 0 - round 1 2082 ld1 {v28.4s}, [r8], #16 @ load rk10 2083 2084 aese q1, v19.16b 2085 aesmc q1, q1 @ AES block 1 - round 1 2086 ldr q12, [r3, #32] @ load h1l | h1h 2087 #ifndef __ARMEB__ 2088 ext v12.16b, v12.16b, v12.16b, #8 2089 #endif 2090 aese q2, v19.16b 2091 aesmc q2, q2 @ AES block 2 - round 1 2092 ld1 {v29.4s}, [r8], #16 @ load rk11 2093 2094 aese q3, v19.16b 2095 aesmc q3, q3 @ AES block 3 - round 1 2096 ldr q14, [r3, #80] @ load h3l | h3h 2097 #ifndef __ARMEB__ 2098 ext v14.16b, v14.16b, v14.16b, #8 2099 #endif 2100 aese q0, v20.16b 2101 aesmc q0, q0 @ AES block 0 - round 2 2102 2103 aese q2, v20.16b 2104 aesmc q2, q2 @ AES block 2 - round 2 2105 2106 aese q3, v20.16b 2107 aesmc q3, q3 @ AES block 3 - round 2 2108 2109 aese q0, v21.16b 2110 aesmc q0, q0 @ AES block 0 - round 3 2111 trn1 q9, v14.2d, v15.2d @ h4h | h3h 2112 2113 aese q2, v21.16b 2114 aesmc q2, q2 @ AES block 2 - round 3 2115 2116 aese q1, v20.16b 2117 aesmc q1, q1 @ AES block 1 - round 2 2118 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 2119 2120 aese q0, v22.16b 2121 aesmc q0, q0 @ AES block 0 - round 4 2122 2123 aese q3, v21.16b 2124 aesmc q3, q3 @ AES block 3 - round 3 2125 2126 aese q1, v21.16b 2127 aesmc q1, q1 @ AES block 1 - round 3 2128 2129 aese q0, v23.16b 2130 aesmc q0, q0 @ AES block 0 - round 5 2131 2132 aese q2, v22.16b 2133 aesmc q2, q2 @ AES block 2 - round 4 2134 2135 aese q1, v22.16b 2136 aesmc q1, q1 @ AES block 1 - round 4 2137 2138 aese q0, v24.16b 2139 aesmc q0, q0 @ AES block 0 - round 6 2140 2141 aese q3, v22.16b 2142 aesmc q3, q3 @ AES block 3 - round 4 2143 2144 aese q2, v23.16b 2145 aesmc q2, q2 @ AES block 2 - round 5 2146 2147 aese q1, v23.16b 2148 aesmc q1, q1 @ AES block 1 - round 5 
2149 2150 aese q3, v23.16b 2151 aesmc q3, q3 @ AES block 3 - round 5 2152 2153 aese q2, v24.16b 2154 aesmc q2, q2 @ AES block 2 - round 6 2155 ldr q13, [r3, #64] @ load h2l | h2h 2156 #ifndef __ARMEB__ 2157 ext v13.16b, v13.16b, v13.16b, #8 2158 #endif 2159 aese q1, v24.16b 2160 aesmc q1, q1 @ AES block 1 - round 6 2161 2162 aese q3, v24.16b 2163 aesmc q3, q3 @ AES block 3 - round 6 2164 2165 aese q0, v25.16b 2166 aesmc q0, q0 @ AES block 0 - round 7 2167 2168 aese q1, v25.16b 2169 aesmc q1, q1 @ AES block 1 - round 7 2170 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 2171 2172 aese q3, v25.16b 2173 aesmc q3, q3 @ AES block 3 - round 7 2174 2175 aese q0, v26.16b 2176 aesmc q0, q0 @ AES block 0 - round 8 2177 2178 aese q2, v25.16b 2179 aesmc q2, q2 @ AES block 2 - round 7 2180 trn1 q8, v12.2d, v13.2d @ h2h | h1h 2181 2182 aese q1, v26.16b 2183 aesmc q1, q1 @ AES block 1 - round 8 2184 2185 aese q3, v26.16b 2186 aesmc q3, q3 @ AES block 3 - round 8 2187 2188 aese q2, v26.16b 2189 aesmc q2, q2 @ AES block 2 - round 8 2190 2191 aese q0, v27.16b 2192 aesmc q0, q0 @ AES block 0 - round 9 2193 2194 aese q3, v27.16b 2195 aesmc q3, q3 @ AES block 3 - round 9 2196 2197 aese q2, v27.16b 2198 aesmc q2, q2 @ AES block 2 - round 9 2199 2200 aese q1, v27.16b 2201 aesmc q1, q1 @ AES block 1 - round 9 2202 2203 aese q0, v28.16b 2204 aesmc q0, q0 @ AES block 0 - round 10 2205 2206 aese q2, v28.16b 2207 aesmc q2, q2 @ AES block 2 - round 10 2208 2209 aese q1, v28.16b 2210 aesmc q1, q1 @ AES block 1 - round 10 2211 lsr r5, r1, #3 @ byte_len 2212 mov r15, r5 2213 2214 aese q3, v28.16b 2215 aesmc q3, q3 @ AES block 3 - round 10 2216 sub r5, r5, #1 @ byte_len - 1 2217 2218 eor v16.16b, v16.16b, q8 @ h2k | h1k 2219 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2220 2221 eor v17.16b, v17.16b, q9 @ h4k | h3k 2222 2223 aese q2, v29.16b @ AES block 2 - round 11 2224 add r4, r0, r1, lsr #3 @ end_input_ptr 2225 add r5, r5, r0 
2226 2227 aese q1, v29.16b @ AES block 1 - round 11 2228 cmp r0, r5 @ check if we have <= 4 blocks 2229 2230 aese q0, v29.16b @ AES block 0 - round 11 2231 add r12, r12, #1 @ CTR block 3 2232 2233 aese q3, v29.16b @ AES block 3 - round 11 2234 bge .L192_enc_tail @ handle tail 2235 2236 rev r9, r12 @ CTR block 4 2237 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext 2238 #ifdef __ARMEB__ 2239 rev r6, r6 2240 rev r7, r7 2241 #endif 2242 orr r9, r11, r9, lsl #32 @ CTR block 4 2243 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext 2244 #ifdef __ARMEB__ 2245 rev r21, r21 2246 rev r22, r22 2247 #endif 2248 ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext 2249 #ifdef __ARMEB__ 2250 rev r23, r23 2251 rev r24, r24 2252 #endif 2253 ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext 2254 #ifdef __ARMEB__ 2255 rev r19, r19 2256 rev r20, r20 2257 #endif 2258 add r0, r0, #64 @ AES input_ptr update 2259 cmp r0, r5 @ check if we have <= 8 blocks 2260 2261 eor r6, r6, r13 @ AES block 0 - round 12 low 2262 2263 eor r7, r7, r14 @ AES block 0 - round 12 high 2264 eor r22, r22, r14 @ AES block 2 - round 12 high 2265 fmov d4, r6 @ AES block 0 - mov low 2266 2267 eor r24, r24, r14 @ AES block 3 - round 12 high 2268 fmov v4.d[1], r7 @ AES block 0 - mov high 2269 2270 eor r21, r21, r13 @ AES block 2 - round 12 low 2271 eor r19, r19, r13 @ AES block 1 - round 12 low 2272 2273 fmov d5, r19 @ AES block 1 - mov low 2274 eor r20, r20, r14 @ AES block 1 - round 12 high 2275 2276 fmov v5.d[1], r20 @ AES block 1 - mov high 2277 2278 eor r23, r23, r13 @ AES block 3 - round 12 low 2279 fmov d6, r21 @ AES block 2 - mov low 2280 2281 add r12, r12, #1 @ CTR block 4 2282 eor q4, q4, q0 @ AES block 0 - result 2283 fmov d0, r10 @ CTR block 4 2284 2285 fmov v0.d[1], r9 @ CTR block 4 2286 rev r9, r12 @ CTR block 5 2287 2288 orr r9, r11, r9, lsl #32 @ CTR block 5 2289 add r12, r12, #1 @ CTR block 5 2290 2291 fmov d7, r23 @ AES block 3 - mov low 2292 st1 { q4}, [r2], #16 @ AES block 0 - store 
result 2293 2294 fmov v6.d[1], r22 @ AES block 2 - mov high 2295 2296 eor q5, q5, q1 @ AES block 1 - result 2297 fmov d1, r10 @ CTR block 5 2298 st1 { q5}, [r2], #16 @ AES block 1 - store result 2299 2300 fmov v7.d[1], r24 @ AES block 3 - mov high 2301 2302 fmov v1.d[1], r9 @ CTR block 5 2303 rev r9, r12 @ CTR block 6 2304 2305 orr r9, r11, r9, lsl #32 @ CTR block 6 2306 2307 add r12, r12, #1 @ CTR block 6 2308 eor q6, q6, q2 @ AES block 2 - result 2309 fmov d2, r10 @ CTR block 6 2310 2311 fmov v2.d[1], r9 @ CTR block 6 2312 rev r9, r12 @ CTR block 7 2313 2314 orr r9, r11, r9, lsl #32 @ CTR block 7 2315 st1 { q6}, [r2], #16 @ AES block 2 - store result 2316 2317 eor q7, q7, q3 @ AES block 3 - result 2318 st1 { q7}, [r2], #16 @ AES block 3 - store result 2319 bge .L192_enc_prepretail @ do prepretail 2320 2321 .L192_enc_main_loop:@ main loop start 2322 aese q2, v18.16b 2323 aesmc q2, q2 @ AES block 4k+6 - round 0 2324 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 2325 2326 aese q1, v18.16b 2327 aesmc q1, q1 @ AES block 4k+5 - round 0 2328 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext 2329 #ifdef __ARMEB__ 2330 rev r19, r19 2331 rev r20, r20 2332 #endif 2333 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 2334 fmov d3, r10 @ CTR block 4k+3 2335 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 2336 2337 aese q2, v19.16b 2338 aesmc q2, q2 @ AES block 4k+6 - round 1 2339 fmov v3.d[1], r9 @ CTR block 4k+3 2340 2341 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 2342 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2343 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext 2344 #ifdef __ARMEB__ 2345 rev r21, r21 2346 rev r22, r22 2347 #endif 2348 aese q0, v18.16b 2349 aesmc q0, q0 @ AES block 4k+4 - round 0 2350 ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext 2351 #ifdef __ARMEB__ 2352 rev r23, r23 2353 rev r24, r24 2354 #endif 2355 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 2356 eor q4, q4, v11.16b @ PRE 1 2357 2358 aese q1, 
v19.16b 2359 aesmc q1, q1 @ AES block 4k+5 - round 1 2360 2361 aese q0, v19.16b 2362 aesmc q0, q0 @ AES block 4k+4 - round 1 2363 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 2364 2365 aese q3, v18.16b 2366 aesmc q3, q3 @ AES block 4k+7 - round 0 2367 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 2368 2369 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 2370 mov d8, v4.d[1] @ GHASH block 4k - mid 2371 2372 aese q0, v20.16b 2373 aesmc q0, q0 @ AES block 4k+4 - round 2 2374 2375 aese q3, v19.16b 2376 aesmc q3, q3 @ AES block 4k+7 - round 1 2377 eor r21, r21, r13 @ AES block 4k+6 - round 12 low 2378 2379 eor q8, q8, q4 @ GHASH block 4k - mid 2380 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 2381 2382 aese q0, v21.16b 2383 aesmc q0, q0 @ AES block 4k+4 - round 3 2384 eor r19, r19, r13 @ AES block 4k+5 - round 12 low 2385 2386 aese q1, v20.16b 2387 aesmc q1, q1 @ AES block 4k+5 - round 2 2388 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 2389 2390 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 2391 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 2392 2393 aese q2, v20.16b 2394 aesmc q2, q2 @ AES block 4k+6 - round 2 2395 2396 aese q1, v21.16b 2397 aesmc q1, q1 @ AES block 4k+5 - round 3 2398 2399 mov d10, v17.d[1] @ GHASH block 4k - mid 2400 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 2401 2402 aese q3, v20.16b 2403 aesmc q3, q3 @ AES block 4k+7 - round 2 2404 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 2405 2406 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 2407 2408 aese q0, v22.16b 2409 aesmc q0, q0 @ AES block 4k+4 - round 4 2410 eor q4, q4, q5 @ GHASH block 4k+1 - mid 2411 2412 aese q3, v21.16b 2413 aesmc q3, q3 @ AES block 4k+7 - round 3 2414 2415 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 2416 eor r20, r20, r14 @ AES block 4k+5 - round 12 high 2417 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 2418 2419 aese q0, v23.16b 2420 aesmc q0, q0 @ AES block 4k+4 - round 5 2421 add r12, r12, #1 @ CTR block 4k+3 2422 2423 aese q3, 
v22.16b 2424 aesmc q3, q3 @ AES block 4k+7 - round 4 2425 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 2426 2427 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 2428 eor r22, r22, r14 @ AES block 4k+6 - round 12 high 2429 2430 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 2431 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 2432 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 2433 2434 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 2435 rev r9, r12 @ CTR block 4k+8 2436 2437 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 2438 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 2439 2440 aese q2, v21.16b 2441 aesmc q2, q2 @ AES block 4k+6 - round 3 2442 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 2443 2444 aese q1, v22.16b 2445 aesmc q1, q1 @ AES block 4k+5 - round 4 2446 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext 2447 #ifdef __ARMEB__ 2448 rev r6, r6 2449 rev r7, r7 2450 #endif 2451 aese q0, v24.16b 2452 aesmc q0, q0 @ AES block 4k+4 - round 6 2453 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 2454 2455 aese q2, v22.16b 2456 aesmc q2, q2 @ AES block 4k+6 - round 4 2457 add r0, r0, #64 @ AES input_ptr update 2458 2459 aese q1, v23.16b 2460 aesmc q1, q1 @ AES block 4k+5 - round 5 2461 movi q8, #0xc2 2462 2463 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 2464 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 2465 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 2466 2467 aese q2, v23.16b 2468 aesmc q2, q2 @ AES block 4k+6 - round 5 2469 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 2470 2471 aese q1, v24.16b 2472 aesmc q1, q1 @ AES block 4k+5 - round 6 2473 shl d8, d8, #56 @ mod_constant 2474 2475 aese q3, v23.16b 2476 aesmc q3, q3 @ AES block 4k+7 - round 5 2477 eor q9, q9, q5 @ GHASH block 4k+3 - high 2478 2479 aese q0, v25.16b 2480 aesmc q0, q0 @ AES block 4k+4 - round 7 2481 fmov d5, r19 @ AES block 4k+5 - mov low 2482 2483 aese q1, v25.16b 2484 aesmc q1, q1 @ AES block 4k+5 - round 7 2485 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 
- mid 2486 2487 aese q3, v24.16b 2488 aesmc q3, q3 @ AES block 4k+7 - round 6 2489 fmov v5.d[1], r20 @ AES block 4k+5 - mov high 2490 2491 aese q0, v26.16b 2492 aesmc q0, q0 @ AES block 4k+4 - round 8 2493 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 2494 2495 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 2496 cmp r0, r5 @ .LOOP CONTROL 2497 fmov d4, r6 @ AES block 4k+4 - mov low 2498 2499 aese q2, v24.16b 2500 aesmc q2, q2 @ AES block 4k+6 - round 6 2501 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 2502 2503 aese q1, v26.16b 2504 aesmc q1, q1 @ AES block 4k+5 - round 8 2505 fmov d7, r23 @ AES block 4k+3 - mov low 2506 2507 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 2508 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 2509 add r12, r12, #1 @ CTR block 4k+8 2510 2511 aese q2, v25.16b 2512 aesmc q2, q2 @ AES block 4k+6 - round 7 2513 fmov v7.d[1], r24 @ AES block 4k+3 - mov high 2514 2515 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 2516 ext q9, q9, q9, #8 @ MODULO - other top alignment 2517 fmov d6, r21 @ AES block 4k+6 - mov low 2518 2519 aese q3, v25.16b 2520 aesmc q3, q3 @ AES block 4k+7 - round 7 2521 2522 aese q0, v27.16b 2523 aesmc q0, q0 @ AES block 4k+4 - round 9 2524 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 2525 2526 aese q2, v26.16b 2527 aesmc q2, q2 @ AES block 4k+6 - round 8 2528 2529 aese q3, v26.16b 2530 aesmc q3, q3 @ AES block 4k+7 - round 8 2531 2532 aese q1, v27.16b 2533 aesmc q1, q1 @ AES block 4k+5 - round 9 2534 2535 aese q0, v28.16b 2536 aesmc q0, q0 @ AES block 4k+4 - round 10 2537 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 2538 2539 aese q3, v27.16b 2540 aesmc q3, q3 @ AES block 4k+7 - round 9 2541 2542 aese q2, v27.16b 2543 aesmc q2, q2 @ AES block 4k+6 - round 9 2544 2545 aese q0, v29.16b @ AES block 4k+4 - round 11 2546 2547 aese q1, v28.16b 2548 aesmc q1, q1 @ AES block 4k+5 - round 10 2549 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 2550 2551 aese q2, v28.16b 
2552 aesmc q2, q2 @ AES block 4k+6 - round 10 2553 2554 eor q4, q4, q0 @ AES block 4k+4 - result 2555 fmov d0, r10 @ CTR block 4k+8 2556 2557 aese q1, v29.16b @ AES block 4k+5 - round 11 2558 fmov v0.d[1], r9 @ CTR block 4k+8 2559 rev r9, r12 @ CTR block 4k+9 2560 2561 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 2562 fmov v6.d[1], r22 @ AES block 4k+6 - mov high 2563 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result 2564 2565 aese q3, v28.16b 2566 aesmc q3, q3 @ AES block 4k+7 - round 10 2567 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 2568 2569 eor q5, q5, q1 @ AES block 4k+5 - result 2570 add r12, r12, #1 @ CTR block 4k+9 2571 fmov d1, r10 @ CTR block 4k+9 2572 2573 aese q2, v29.16b @ AES block 4k+6 - round 11 2574 fmov v1.d[1], r9 @ CTR block 4k+9 2575 rev r9, r12 @ CTR block 4k+10 2576 2577 add r12, r12, #1 @ CTR block 4k+10 2578 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 2579 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 2580 2581 st1 { q5}, [r2], #16 @ AES block 4k+5 - store result 2582 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 2583 2584 aese q3, v29.16b @ AES block 4k+7 - round 11 2585 eor q6, q6, q2 @ AES block 4k+6 - result 2586 fmov d2, r10 @ CTR block 4k+10 2587 2588 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result 2589 fmov v2.d[1], r9 @ CTR block 4k+10 2590 rev r9, r12 @ CTR block 4k+11 2591 2592 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 2593 orr r9, r11, r9, lsl #32 @ CTR block 4k+11 2594 2595 eor q7, q7, q3 @ AES block 4k+3 - result 2596 st1 { q7}, [r2], #16 @ AES block 4k+3 - store result 2597 blt .L192_enc_main_loop 2598 2599 .L192_enc_prepretail:@ PREPRETAIL 2600 aese q0, v18.16b 2601 aesmc q0, q0 @ AES block 4k+4 - round 0 2602 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 2603 2604 fmov d3, r10 @ CTR block 4k+3 2605 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 2606 add r12, r12, #1 @ CTR block 4k+3 2607 2608 aese q1, v18.16b 2609 aesmc q1, q1 @ AES block 4k+5 - round 0 2610 rev64 q5, q5 @ 
GHASH block 4k+1 (t0 and t1 free) 2611 2612 aese q2, v18.16b 2613 aesmc q2, q2 @ AES block 4k+6 - round 0 2614 2615 fmov v3.d[1], r9 @ CTR block 4k+3 2616 eor q4, q4, v11.16b @ PRE 1 2617 mov d10, v17.d[1] @ GHASH block 4k - mid 2618 2619 aese q1, v19.16b 2620 aesmc q1, q1 @ AES block 4k+5 - round 1 2621 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 2622 2623 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 2624 2625 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 2626 mov d8, v4.d[1] @ GHASH block 4k - mid 2627 2628 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 2629 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2630 2631 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 2632 2633 eor q8, q8, q4 @ GHASH block 4k - mid 2634 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 2635 2636 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 2637 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 2638 2639 aese q3, v18.16b 2640 aesmc q3, q3 @ AES block 4k+7 - round 0 2641 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 2642 2643 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 2644 2645 eor q4, q4, q5 @ GHASH block 4k+1 - mid 2646 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 2647 2648 aese q3, v19.16b 2649 aesmc q3, q3 @ AES block 4k+7 - round 1 2650 2651 aese q2, v19.16b 2652 aesmc q2, q2 @ AES block 4k+6 - round 1 2653 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 2654 2655 aese q0, v19.16b 2656 aesmc q0, q0 @ AES block 4k+4 - round 1 2657 2658 aese q1, v20.16b 2659 aesmc q1, q1 @ AES block 4k+5 - round 2 2660 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 2661 2662 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 2663 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 2664 2665 aese q0, v20.16b 2666 aesmc q0, q0 @ AES block 4k+4 - round 2 2667 2668 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 2669 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 2670 2671 aese q1, v21.16b 2672 aesmc q1, q1 @ AES block 4k+5 - round 3 2673 2674 pmull2 v31.1q, v31.2d, v16.2d @ 
GHASH block 4k+2 - mid 2675 2676 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 2677 2678 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 2679 eor q9, q9, q5 @ GHASH block 4k+3 - high 2680 2681 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 2682 2683 aese q0, v21.16b 2684 aesmc q0, q0 @ AES block 4k+4 - round 3 2685 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 2686 2687 aese q3, v20.16b 2688 aesmc q3, q3 @ AES block 4k+7 - round 2 2689 2690 aese q2, v20.16b 2691 aesmc q2, q2 @ AES block 4k+6 - round 2 2692 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 2693 2694 aese q0, v22.16b 2695 aesmc q0, q0 @ AES block 4k+4 - round 4 2696 2697 aese q3, v21.16b 2698 aesmc q3, q3 @ AES block 4k+7 - round 3 2699 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 2700 2701 aese q2, v21.16b 2702 aesmc q2, q2 @ AES block 4k+6 - round 3 2703 2704 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 2705 movi q8, #0xc2 2706 2707 aese q3, v22.16b 2708 aesmc q3, q3 @ AES block 4k+7 - round 4 2709 2710 aese q2, v22.16b 2711 aesmc q2, q2 @ AES block 4k+6 - round 4 2712 2713 aese q1, v22.16b 2714 aesmc q1, q1 @ AES block 4k+5 - round 4 2715 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 2716 2717 aese q3, v23.16b 2718 aesmc q3, q3 @ AES block 4k+7 - round 5 2719 2720 aese q2, v23.16b 2721 aesmc q2, q2 @ AES block 4k+6 - round 5 2722 2723 aese q1, v23.16b 2724 aesmc q1, q1 @ AES block 4k+5 - round 5 2725 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 2726 2727 aese q0, v23.16b 2728 aesmc q0, q0 @ AES block 4k+4 - round 5 2729 2730 aese q3, v24.16b 2731 aesmc q3, q3 @ AES block 4k+7 - round 6 2732 eor v10.16b, v10.16b, q9 @ karatsuba tidy up 2733 2734 aese q1, v24.16b 2735 aesmc q1, q1 @ AES block 4k+5 - round 6 2736 2737 aese q0, v24.16b 2738 aesmc q0, q0 @ AES block 4k+4 - round 6 2739 shl d8, d8, #56 @ mod_constant 2740 2741 aese q3, v25.16b 2742 aesmc q3, q3 @ AES block 4k+7 - round 7 2743 2744 aese q1, v25.16b 2745 aesmc q1, q1 @ AES block 4k+5 - round 7 
2746 eor v10.16b, v10.16b, v11.16b 2747 2748 aese q0, v25.16b 2749 aesmc q0, q0 @ AES block 4k+4 - round 7 2750 2751 pmull v30.1q, q9, q8 2752 2753 aese q2, v24.16b 2754 aesmc q2, q2 @ AES block 4k+6 - round 6 2755 ext q9, q9, q9, #8 2756 2757 aese q0, v26.16b 2758 aesmc q0, q0 @ AES block 4k+4 - round 8 2759 2760 aese q1, v26.16b 2761 aesmc q1, q1 @ AES block 4k+5 - round 8 2762 eor v10.16b, v10.16b, v30.16b 2763 2764 aese q2, v25.16b 2765 aesmc q2, q2 @ AES block 4k+6 - round 7 2766 2767 aese q3, v26.16b 2768 aesmc q3, q3 @ AES block 4k+7 - round 8 2769 2770 aese q0, v27.16b 2771 aesmc q0, q0 @ AES block 4k+4 - round 9 2772 2773 aese q2, v26.16b 2774 aesmc q2, q2 @ AES block 4k+6 - round 8 2775 eor v10.16b, v10.16b, q9 2776 2777 aese q3, v27.16b 2778 aesmc q3, q3 @ AES block 4k+7 - round 9 2779 2780 aese q1, v27.16b 2781 aesmc q1, q1 @ AES block 4k+5 - round 9 2782 2783 aese q2, v27.16b 2784 aesmc q2, q2 @ AES block 4k+6 - round 9 2785 2786 pmull v30.1q, v10.1d, q8 2787 2788 ext v10.16b, v10.16b, v10.16b, #8 2789 2790 aese q3, v28.16b 2791 aesmc q3, q3 @ AES block 4k+7 - round 10 2792 2793 aese q0, v28.16b 2794 aesmc q0, q0 @ AES block 4k+4 - round 10 2795 2796 aese q2, v28.16b 2797 aesmc q2, q2 @ AES block 4k+6 - round 10 2798 2799 aese q1, v28.16b 2800 aesmc q1, q1 @ AES block 4k+5 - round 10 2801 eor v11.16b, v11.16b, v30.16b 2802 2803 aese q0, v29.16b @ AES block 4k+4 - round 11 2804 2805 aese q3, v29.16b @ AES block 4k+7 - round 11 2806 2807 aese q2, v29.16b @ AES block 4k+6 - round 11 2808 2809 aese q1, v29.16b @ AES block 4k+5 - round 11 2810 eor v11.16b, v11.16b, v10.16b 2811 .L192_enc_tail:@ TAIL 2812 2813 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 2814 ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext 2815 #ifdef __ARMEB__ 2816 rev r6, r6 2817 rev r7, r7 2818 #endif 2819 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 2820 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 2821 2822 fmov d4, r6 @ AES block 4k+4 - mov low 
2823 2824 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 2825 cmp r5, #48 2826 2827 eor q5, q4, q0 @ AES block 4k+4 - result 2828 2829 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 2830 bgt .L192_enc_blocks_more_than_3 2831 2832 sub r12, r12, #1 2833 movi v10.8b, #0 2834 2835 mov q3, q2 2836 movi q9, #0 2837 cmp r5, #32 2838 2839 mov q2, q1 2840 movi v11.8b, #0 2841 bgt .L192_enc_blocks_more_than_2 2842 2843 sub r12, r12, #1 2844 2845 mov q3, q1 2846 cmp r5, #16 2847 bgt .L192_enc_blocks_more_than_1 2848 2849 sub r12, r12, #1 2850 b .L192_enc_blocks_less_than_1 2851 .L192_enc_blocks_more_than_3:@ blocks left > 3 2852 st1 { q5}, [r2], #16 @ AES final-3 block - store result 2853 2854 ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high 2855 #ifdef __ARMEB__ 2856 rev r6, r6 2857 rev r7, r7 2858 #endif 2859 rev64 q4, q5 @ GHASH final-3 block 2860 2861 eor r6, r6, r13 @ AES final-2 block - round 12 low 2862 eor q4, q4, q8 @ feed in partial tag 2863 2864 eor r7, r7, r14 @ AES final-2 block - round 12 high 2865 fmov d5, r6 @ AES final-2 block - mov low 2866 2867 fmov v5.d[1], r7 @ AES final-2 block - mov high 2868 2869 mov d22, v4.d[1] @ GHASH final-3 block - mid 2870 2871 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 2872 2873 mov d10, v17.d[1] @ GHASH final-3 block - mid 2874 2875 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 2876 2877 movi q8, #0 @ suppress further partial tag feed in 2878 2879 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 2880 2881 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 2882 eor q5, q5, q1 @ AES final-2 block - result 2883 .L192_enc_blocks_more_than_2:@ blocks left > 2 2884 2885 st1 { q5}, [r2], #16 @ AES final-2 block - store result 2886 2887 rev64 q4, q5 @ GHASH final-2 block 2888 ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high 2889 #ifdef __ARMEB__ 2890 rev r6, r6 2891 rev r7, r7 2892 #endif 2893 eor q4, q4, q8 @ feed in partial tag 2894 2895 eor r7, r7, r14 @ AES final-1 
block - round 12 high 2896 2897 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 2898 mov d22, v4.d[1] @ GHASH final-2 block - mid 2899 2900 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 2901 eor r6, r6, r13 @ AES final-1 block - round 12 low 2902 2903 fmov d5, r6 @ AES final-1 block - mov low 2904 2905 fmov v5.d[1], r7 @ AES final-1 block - mov high 2906 eor q9, q9, v20.16b @ GHASH final-2 block - high 2907 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 2908 2909 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 2910 2911 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 2912 2913 movi q8, #0 @ suppress further partial tag feed in 2914 2915 eor q5, q5, q2 @ AES final-1 block - result 2916 2917 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 2918 .L192_enc_blocks_more_than_1:@ blocks left > 1 2919 2920 st1 { q5}, [r2], #16 @ AES final-1 block - store result 2921 2922 ldp r6, r7, [r0], #16 @ AES final block - load input low & high 2923 #ifdef __ARMEB__ 2924 rev r6, r6 2925 rev r7, r7 2926 #endif 2927 rev64 q4, q5 @ GHASH final-1 block 2928 2929 eor r6, r6, r13 @ AES final block - round 12 low 2930 eor q4, q4, q8 @ feed in partial tag 2931 movi q8, #0 @ suppress further partial tag feed in 2932 2933 mov d22, v4.d[1] @ GHASH final-1 block - mid 2934 2935 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 2936 eor r7, r7, r14 @ AES final block - round 12 high 2937 fmov d5, r6 @ AES final block - mov low 2938 2939 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 2940 fmov v5.d[1], r7 @ AES final block - mov high 2941 2942 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 2943 2944 eor q9, q9, v20.16b @ GHASH final-1 block - high 2945 2946 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 2947 2948 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 2949 2950 eor q5, q5, q3 @ AES final block - result 2951 2952 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 2953 2954 eor v10.16b, v10.16b, v22.16b @ GHASH 
final-1 block - mid 2955 .L192_enc_blocks_less_than_1:@ blocks left <= 1 2956 2957 ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored 2958 #ifndef __ARMEB__ 2959 rev r9, r12 2960 #else 2961 mov r9, r12 2962 #endif 2963 and r1, r1, #127 @ bit_length %= 128 2964 2965 sub r1, r1, #128 @ bit_length -= 128 2966 mvn r14, xzr @ rk12_h = 0xffffffffffffffff 2967 2968 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 2969 mvn r13, xzr @ rk12_l = 0xffffffffffffffff 2970 2971 and r1, r1, #127 @ bit_length %= 128 2972 2973 lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block 2974 cmp r1, #64 2975 2976 csel r6, r13, r14, lt 2977 csel r7, r14, xzr, lt 2978 2979 fmov d0, r6 @ ctr0b is mask for last block 2980 2981 fmov v0.d[1], r7 2982 2983 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 2984 2985 rev64 q4, q5 @ GHASH final block 2986 2987 eor q4, q4, q8 @ feed in partial tag 2988 2989 mov d8, v4.d[1] @ GHASH final block - mid 2990 2991 pmull v21.1q, q4, v12.1d @ GHASH final block - low 2992 2993 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 2994 2995 eor q8, q8, q4 @ GHASH final block - mid 2996 2997 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 2998 2999 eor q9, q9, v20.16b @ GHASH final block - high 3000 3001 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 3002 3003 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 3004 movi q8, #0xc2 3005 3006 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3007 3008 shl d8, d8, #56 @ mod_constant 3009 3010 bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing 3011 3012 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3013 3014 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3015 3016 ext q9, q9, q9, #8 @ MODULO - other top alignment 3017 3018 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3019 3020 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3021 3022 pmull v9.1q, 
v10.1d, q8 @ MODULO - mid 64b align with low 3023 3024 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3025 3026 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 3027 str r9, [r16, #12] @ store the updated counter 3028 3029 st1 { q5}, [r2] @ store all 16B 3030 3031 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3032 ext v11.16b, v11.16b, v11.16b, #8 3033 rev64 v11.16b, v11.16b 3034 mov r0, r15 3035 st1 { v11.16b }, [r3] 3036 3037 ldp r21, r22, [sp, #16] 3038 ldp r23, r24, [sp, #32] 3039 ldp d8, d9, [sp, #48] 3040 ldp d10, d11, [sp, #64] 3041 ldp d12, d13, [sp, #80] 3042 ldp d14, d15, [sp, #96] 3043 ldp r19, r20, [sp], #112 3044 RET 3045 3046 .L192_enc_ret: 3047 mov r0, #0x0 3048 RET 3049 .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3050 .globl aes_gcm_dec_192_kernel 3051 .type aes_gcm_dec_192_kernel,%function 3052 .align 4 3053 aes_gcm_dec_192_kernel: 3054 AARCH64_VALID_CALL_TARGET 3055 cbz r1, .L192_dec_ret 3056 stp r19, r20, [sp, #-112]! 3057 mov r16, r4 3058 mov r8, r5 3059 stp r21, r22, [sp, #16] 3060 stp r23, r24, [sp, #32] 3061 stp d8, d9, [sp, #48] 3062 stp d10, d11, [sp, #64] 3063 stp d12, d13, [sp, #80] 3064 stp d14, d15, [sp, #96] 3065 3066 add r4, r0, r1, lsr #3 @ end_input_ptr 3067 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 3068 #ifdef __ARMEB__ 3069 rev r10, r10 3070 rev r11, r11 3071 #endif 3072 ldp r13, r14, [r8, #192] @ load rk12 3073 #ifdef __ARMEB__ 3074 ror r13, r13, #32 3075 ror r14, r14, #32 3076 #endif 3077 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 3078 3079 ld1 {v18.4s}, [r8], #16 @ load rk0 3080 3081 lsr r5, r1, #3 @ byte_len 3082 mov r15, r5 3083 ld1 {v19.4s}, [r8], #16 @ load rk1 3084 3085 lsr r12, r11, #32 3086 orr r11, r11, r11 3087 fmov d3, r10 @ CTR block 3 3088 3089 rev r12, r12 @ rev_ctr32 3090 fmov d1, r10 @ CTR block 1 3091 3092 add r12, r12, #1 @ increment rev_ctr32 3093 ld1 {v20.4s}, [r8], #16 @ load rk2 3094 3095 aese q0, 
v18.16b 3096 aesmc q0, q0 @ AES block 0 - round 0 3097 rev r9, r12 @ CTR block 1 3098 3099 add r12, r12, #1 @ CTR block 1 3100 orr r9, r11, r9, lsl #32 @ CTR block 1 3101 ld1 {v21.4s}, [r8], #16 @ load rk3 3102 3103 fmov v1.d[1], r9 @ CTR block 1 3104 rev r9, r12 @ CTR block 2 3105 add r12, r12, #1 @ CTR block 2 3106 3107 fmov d2, r10 @ CTR block 2 3108 orr r9, r11, r9, lsl #32 @ CTR block 2 3109 3110 fmov v2.d[1], r9 @ CTR block 2 3111 rev r9, r12 @ CTR block 3 3112 3113 aese q0, v19.16b 3114 aesmc q0, q0 @ AES block 0 - round 1 3115 orr r9, r11, r9, lsl #32 @ CTR block 3 3116 3117 fmov v3.d[1], r9 @ CTR block 3 3118 3119 ld1 {v22.4s}, [r8], #16 @ load rk4 3120 3121 aese q0, v20.16b 3122 aesmc q0, q0 @ AES block 0 - round 2 3123 3124 aese q2, v18.16b 3125 aesmc q2, q2 @ AES block 2 - round 0 3126 ld1 {v23.4s}, [r8], #16 @ load rk5 3127 3128 aese q1, v18.16b 3129 aesmc q1, q1 @ AES block 1 - round 0 3130 ldr q15, [r3, #112] @ load h4l | h4h 3131 #ifndef __ARMEB__ 3132 ext v15.16b, v15.16b, v15.16b, #8 3133 #endif 3134 aese q3, v18.16b 3135 aesmc q3, q3 @ AES block 3 - round 0 3136 ldr q13, [r3, #64] @ load h2l | h2h 3137 #ifndef __ARMEB__ 3138 ext v13.16b, v13.16b, v13.16b, #8 3139 #endif 3140 aese q2, v19.16b 3141 aesmc q2, q2 @ AES block 2 - round 1 3142 ldr q14, [r3, #80] @ load h3l | h3h 3143 #ifndef __ARMEB__ 3144 ext v14.16b, v14.16b, v14.16b, #8 3145 #endif 3146 aese q1, v19.16b 3147 aesmc q1, q1 @ AES block 1 - round 1 3148 3149 aese q3, v19.16b 3150 aesmc q3, q3 @ AES block 3 - round 1 3151 ldr q12, [r3, #32] @ load h1l | h1h 3152 #ifndef __ARMEB__ 3153 ext v12.16b, v12.16b, v12.16b, #8 3154 #endif 3155 aese q2, v20.16b 3156 aesmc q2, q2 @ AES block 2 - round 2 3157 ld1 {v24.4s}, [r8], #16 @ load rk6 3158 3159 aese q0, v21.16b 3160 aesmc q0, q0 @ AES block 0 - round 3 3161 ld1 {v25.4s}, [r8], #16 @ load rk7 3162 3163 aese q1, v20.16b 3164 aesmc q1, q1 @ AES block 1 - round 2 3165 ld1 {v26.4s}, [r8], #16 @ load rk8 3166 3167 aese q3, v20.16b 3168 aesmc q3, 
q3 @ AES block 3 - round 2 3169 ld1 {v27.4s}, [r8], #16 @ load rk9 3170 3171 aese q2, v21.16b 3172 aesmc q2, q2 @ AES block 2 - round 3 3173 ld1 { v11.16b}, [r3] 3174 ext v11.16b, v11.16b, v11.16b, #8 3175 rev64 v11.16b, v11.16b 3176 3177 aese q1, v21.16b 3178 aesmc q1, q1 @ AES block 1 - round 3 3179 add r12, r12, #1 @ CTR block 3 3180 3181 aese q3, v21.16b 3182 aesmc q3, q3 @ AES block 3 - round 3 3183 trn1 q9, v14.2d, v15.2d @ h4h | h3h 3184 3185 aese q0, v22.16b 3186 aesmc q0, q0 @ AES block 0 - round 4 3187 ld1 {v28.4s}, [r8], #16 @ load rk10 3188 3189 aese q1, v22.16b 3190 aesmc q1, q1 @ AES block 1 - round 4 3191 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 3192 3193 aese q2, v22.16b 3194 aesmc q2, q2 @ AES block 2 - round 4 3195 3196 aese q3, v22.16b 3197 aesmc q3, q3 @ AES block 3 - round 4 3198 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 3199 3200 aese q0, v23.16b 3201 aesmc q0, q0 @ AES block 0 - round 5 3202 ld1 {v29.4s}, [r8], #16 @ load rk11 3203 3204 aese q1, v23.16b 3205 aesmc q1, q1 @ AES block 1 - round 5 3206 3207 aese q2, v23.16b 3208 aesmc q2, q2 @ AES block 2 - round 5 3209 3210 aese q3, v23.16b 3211 aesmc q3, q3 @ AES block 3 - round 5 3212 3213 aese q0, v24.16b 3214 aesmc q0, q0 @ AES block 0 - round 6 3215 3216 aese q2, v24.16b 3217 aesmc q2, q2 @ AES block 2 - round 6 3218 3219 aese q3, v24.16b 3220 aesmc q3, q3 @ AES block 3 - round 6 3221 3222 aese q0, v25.16b 3223 aesmc q0, q0 @ AES block 0 - round 7 3224 3225 aese q2, v25.16b 3226 aesmc q2, q2 @ AES block 2 - round 7 3227 3228 aese q3, v25.16b 3229 aesmc q3, q3 @ AES block 3 - round 7 3230 3231 aese q1, v24.16b 3232 aesmc q1, q1 @ AES block 1 - round 6 3233 3234 aese q2, v26.16b 3235 aesmc q2, q2 @ AES block 2 - round 8 3236 3237 aese q3, v26.16b 3238 aesmc q3, q3 @ AES block 3 - round 8 3239 3240 aese q1, v25.16b 3241 aesmc q1, q1 @ AES block 1 - round 7 3242 3243 aese q2, v27.16b 3244 aesmc q2, q2 @ AES block 2 - round 9 3245 3246 aese q3, v27.16b 3247 aesmc q3, q3 @ AES block 3 - round 9 
3248 3249 aese q1, v26.16b 3250 aesmc q1, q1 @ AES block 1 - round 8 3251 sub r5, r5, #1 @ byte_len - 1 3252 3253 aese q0, v26.16b 3254 aesmc q0, q0 @ AES block 0 - round 8 3255 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3256 3257 aese q3, v28.16b 3258 aesmc q3, q3 @ AES block 3 - round 10 3259 add r5, r5, r0 3260 3261 aese q1, v27.16b 3262 aesmc q1, q1 @ AES block 1 - round 9 3263 cmp r0, r5 @ check if we have <= 4 blocks 3264 3265 aese q0, v27.16b 3266 aesmc q0, q0 @ AES block 0 - round 9 3267 trn1 q8, v12.2d, v13.2d @ h2h | h1h 3268 3269 aese q3, v29.16b @ AES block 3 - round 11 3270 3271 aese q2, v28.16b 3272 aesmc q2, q2 @ AES block 2 - round 10 3273 3274 aese q1, v28.16b 3275 aesmc q1, q1 @ AES block 1 - round 10 3276 3277 aese q0, v28.16b 3278 aesmc q0, q0 @ AES block 0 - round 10 3279 eor v16.16b, v16.16b, q8 @ h2k | h1k 3280 3281 aese q2, v29.16b @ AES block 2 - round 11 3282 3283 aese q1, v29.16b @ AES block 1 - round 11 3284 eor v17.16b, v17.16b, q9 @ h4k | h3k 3285 3286 aese q0, v29.16b @ AES block 0 - round 11 3287 bge .L192_dec_tail @ handle tail 3288 3289 ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext 3290 3291 eor q1, q5, q1 @ AES block 1 - result 3292 3293 eor q0, q4, q0 @ AES block 0 - result 3294 rev r9, r12 @ CTR block 4 3295 ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext 3296 3297 mov r19, v1.d[0] @ AES block 1 - mov low 3298 3299 mov r20, v1.d[1] @ AES block 1 - mov high 3300 3301 mov r6, v0.d[0] @ AES block 0 - mov low 3302 orr r9, r11, r9, lsl #32 @ CTR block 4 3303 add r12, r12, #1 @ CTR block 4 3304 3305 mov r7, v0.d[1] @ AES block 0 - mov high 3306 rev64 q4, q4 @ GHASH block 0 3307 3308 fmov d0, r10 @ CTR block 4 3309 rev64 q5, q5 @ GHASH block 1 3310 cmp r0, r5 @ check if we have <= 8 blocks 3311 3312 eor r19, r19, r13 @ AES block 1 - round 12 low 3313 #ifdef __ARMEB__ 3314 rev r19, r19 3315 #endif 3316 fmov v0.d[1], r9 @ CTR block 4 3317 
rev r9, r12 @ CTR block 5 3318 3319 orr r9, r11, r9, lsl #32 @ CTR block 5 3320 fmov d1, r10 @ CTR block 5 3321 eor r20, r20, r14 @ AES block 1 - round 12 high 3322 #ifdef __ARMEB__ 3323 rev r20, r20 3324 #endif 3325 add r12, r12, #1 @ CTR block 5 3326 fmov v1.d[1], r9 @ CTR block 5 3327 eor r6, r6, r13 @ AES block 0 - round 12 low 3328 #ifdef __ARMEB__ 3329 rev r6, r6 3330 #endif 3331 rev r9, r12 @ CTR block 6 3332 eor r7, r7, r14 @ AES block 0 - round 12 high 3333 #ifdef __ARMEB__ 3334 rev r7, r7 3335 #endif 3336 stp r6, r7, [r2], #16 @ AES block 0 - store result 3337 orr r9, r11, r9, lsl #32 @ CTR block 6 3338 3339 stp r19, r20, [r2], #16 @ AES block 1 - store result 3340 3341 add r12, r12, #1 @ CTR block 6 3342 eor q2, q6, q2 @ AES block 2 - result 3343 bge .L192_dec_prepretail @ do prepretail 3344 3345 .L192_dec_main_loop:@ main loop start 3346 aese q1, v18.16b 3347 aesmc q1, q1 @ AES block 4k+5 - round 0 3348 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 3349 3350 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 3351 mov r21, v2.d[0] @ AES block 4k+2 - mov low 3352 3353 mov r22, v2.d[1] @ AES block 4k+2 - mov high 3354 eor q3, q7, q3 @ AES block 4k+3 - result 3355 rev64 q7, q7 @ GHASH block 4k+3 3356 3357 aese q1, v19.16b 3358 aesmc q1, q1 @ AES block 4k+5 - round 1 3359 fmov d2, r10 @ CTR block 4k+6 3360 3361 aese q0, v18.16b 3362 aesmc q0, q0 @ AES block 4k+4 - round 0 3363 eor q4, q4, v11.16b @ PRE 1 3364 3365 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 3366 fmov v2.d[1], r9 @ CTR block 4k+6 3367 3368 aese q1, v20.16b 3369 aesmc q1, q1 @ AES block 4k+5 - round 2 3370 mov r24, v3.d[1] @ AES block 4k+3 - mov high 3371 3372 aese q0, v19.16b 3373 aesmc q0, q0 @ AES block 4k+4 - round 1 3374 mov r23, v3.d[0] @ AES block 4k+3 - mov low 3375 3376 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 3377 fmov d3, r10 @ CTR block 4k+7 3378 mov d8, v4.d[1] @ GHASH block 4k - mid 3379 3380 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 3381 mov d10, v17.d[1] @ 
GHASH block 4k - mid 3382 rev r9, r12 @ CTR block 4k+7 3383 3384 aese q2, v18.16b 3385 aesmc q2, q2 @ AES block 4k+6 - round 0 3386 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 3387 3388 fmov v3.d[1], r9 @ CTR block 4k+7 3389 eor q8, q8, q4 @ GHASH block 4k - mid 3390 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 3391 3392 aese q1, v21.16b 3393 aesmc q1, q1 @ AES block 4k+5 - round 3 3394 3395 aese q0, v20.16b 3396 aesmc q0, q0 @ AES block 4k+4 - round 2 3397 eor r22, r22, r14 @ AES block 4k+2 - round 12 high 3398 #ifdef __ARMEB__ 3399 rev r22, r22 3400 #endif 3401 aese q2, v19.16b 3402 aesmc q2, q2 @ AES block 4k+6 - round 1 3403 eor q4, q4, q5 @ GHASH block 4k+1 - mid 3404 3405 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 3406 3407 aese q3, v18.16b 3408 aesmc q3, q3 @ AES block 4k+7 - round 0 3409 rev64 q6, q6 @ GHASH block 4k+2 3410 3411 aese q2, v20.16b 3412 aesmc q2, q2 @ AES block 4k+6 - round 2 3413 3414 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 3415 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 3416 eor r21, r21, r13 @ AES block 4k+2 - round 12 low 3417 #ifdef __ARMEB__ 3418 rev r21, r21 3419 #endif 3420 aese q1, v22.16b 3421 aesmc q1, q1 @ AES block 4k+5 - round 4 3422 3423 aese q0, v21.16b 3424 aesmc q0, q0 @ AES block 4k+4 - round 3 3425 3426 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 3427 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 3428 3429 aese q3, v19.16b 3430 aesmc q3, q3 @ AES block 4k+7 - round 1 3431 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 3432 3433 aese q0, v22.16b 3434 aesmc q0, q0 @ AES block 4k+4 - round 4 3435 3436 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 3437 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 3438 3439 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 3440 3441 aese q0, v23.16b 3442 aesmc q0, q0 @ AES block 4k+4 - round 5 3443 3444 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 3445 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 3446 3447 aese q1, v23.16b 3448 aesmc q1, q1 @ AES block 4k+5 - round 5 
3449 3450 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 3451 3452 aese q3, v20.16b 3453 aesmc q3, q3 @ AES block 4k+7 - round 2 3454 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 3455 3456 aese q1, v24.16b 3457 aesmc q1, q1 @ AES block 4k+5 - round 6 3458 3459 aese q0, v24.16b 3460 aesmc q0, q0 @ AES block 4k+4 - round 6 3461 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 3462 3463 aese q3, v21.16b 3464 aesmc q3, q3 @ AES block 4k+7 - round 3 3465 3466 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 3467 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 3468 3469 aese q0, v25.16b 3470 aesmc q0, q0 @ AES block 4k+4 - round 7 3471 3472 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 3473 eor q9, q9, q5 @ GHASH block 4k+3 - high 3474 3475 aese q1, v25.16b 3476 aesmc q1, q1 @ AES block 4k+5 - round 7 3477 3478 aese q0, v26.16b 3479 aesmc q0, q0 @ AES block 4k+4 - round 8 3480 movi q8, #0xc2 3481 3482 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 3483 3484 aese q1, v26.16b 3485 aesmc q1, q1 @ AES block 4k+5 - round 8 3486 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 3487 3488 aese q2, v21.16b 3489 aesmc q2, q2 @ AES block 4k+6 - round 3 3490 3491 aese q0, v27.16b 3492 aesmc q0, q0 @ AES block 4k+4 - round 9 3493 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 3494 3495 aese q3, v22.16b 3496 aesmc q3, q3 @ AES block 4k+7 - round 4 3497 3498 aese q2, v22.16b 3499 aesmc q2, q2 @ AES block 4k+6 - round 4 3500 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 3501 3502 aese q0, v28.16b 3503 aesmc q0, q0 @ AES block 4k+4 - round 10 3504 3505 aese q1, v27.16b 3506 aesmc q1, q1 @ AES block 4k+5 - round 9 3507 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3508 3509 aese q2, v23.16b 3510 aesmc q2, q2 @ AES block 4k+6 - round 5 3511 3512 aese q3, v23.16b 3513 aesmc q3, q3 @ AES block 4k+7 - round 5 3514 shl d8, d8, #56 @ mod_constant 3515 3516 aese q1, v28.16b 3517 aesmc q1, q1 @ AES block 4k+5 - round 10 3518 3519 aese q2, v24.16b 
3520 aesmc q2, q2 @ AES block 4k+6 - round 6 3521 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext 3522 3523 aese q3, v24.16b 3524 aesmc q3, q3 @ AES block 4k+7 - round 6 3525 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3526 3527 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3528 ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext 3529 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 3530 #ifdef __ARMEB__ 3531 rev r23, r23 3532 #endif 3533 aese q2, v25.16b 3534 aesmc q2, q2 @ AES block 4k+6 - round 7 3535 ext q9, q9, q9, #8 @ MODULO - other top alignment 3536 3537 aese q0, v29.16b @ AES block 4k+4 - round 11 3538 add r12, r12, #1 @ CTR block 4k+7 3539 3540 aese q3, v25.16b 3541 aesmc q3, q3 @ AES block 4k+7 - round 7 3542 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3543 3544 aese q2, v26.16b 3545 aesmc q2, q2 @ AES block 4k+6 - round 8 3546 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext 3547 3548 aese q1, v29.16b @ AES block 4k+5 - round 11 3549 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext 3550 rev r9, r12 @ CTR block 4k+8 3551 3552 aese q3, v26.16b 3553 aesmc q3, q3 @ AES block 4k+7 - round 8 3554 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 3555 3556 aese q2, v27.16b 3557 aesmc q2, q2 @ AES block 4k+6 - round 9 3558 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3559 3560 cmp r0, r5 @ .LOOP CONTROL 3561 3562 eor q0, q4, q0 @ AES block 4k+4 - result 3563 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 3564 #ifdef __ARMEB__ 3565 rev r24, r24 3566 #endif 3567 eor q1, q5, q1 @ AES block 4k+5 - result 3568 3569 aese q2, v28.16b 3570 aesmc q2, q2 @ AES block 4k+6 - round 10 3571 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 3572 3573 aese q3, v27.16b 3574 aesmc q3, q3 @ AES block 4k+7 - round 9 3575 3576 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 3577 mov r19, v1.d[0] @ AES block 4k+5 - mov low 3578 3579 mov r6, v0.d[0] @ AES block 4k+4 - mov low 3580 stp r23, r24, [r2], #16 @ 
AES block 4k+3 - store result 3581 rev64 q5, q5 @ GHASH block 4k+5 3582 3583 aese q2, v29.16b @ AES block 4k+6 - round 11 3584 mov r7, v0.d[1] @ AES block 4k+4 - mov high 3585 3586 aese q3, v28.16b 3587 aesmc q3, q3 @ AES block 4k+7 - round 10 3588 mov r20, v1.d[1] @ AES block 4k+5 - mov high 3589 3590 fmov d0, r10 @ CTR block 4k+8 3591 add r12, r12, #1 @ CTR block 4k+8 3592 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3593 3594 eor q2, q6, q2 @ AES block 4k+6 - result 3595 fmov v0.d[1], r9 @ CTR block 4k+8 3596 rev r9, r12 @ CTR block 4k+9 3597 3598 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 3599 #ifdef __ARMEB__ 3600 rev r6, r6 3601 #endif 3602 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 3603 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 3604 3605 fmov d1, r10 @ CTR block 4k+9 3606 add r12, r12, #1 @ CTR block 4k+9 3607 eor r19, r19, r13 @ AES block 4k+5 - round 12 low 3608 #ifdef __ARMEB__ 3609 rev r19, r19 3610 #endif 3611 fmov v1.d[1], r9 @ CTR block 4k+9 3612 rev r9, r12 @ CTR block 4k+10 3613 eor r20, r20, r14 @ AES block 4k+5 - round 12 high 3614 #ifdef __ARMEB__ 3615 rev r20, r20 3616 #endif 3617 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 3618 #ifdef __ARMEB__ 3619 rev r7, r7 3620 #endif 3621 stp r6, r7, [r2], #16 @ AES block 4k+4 - store result 3622 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3623 3624 add r12, r12, #1 @ CTR block 4k+10 3625 rev64 q4, q4 @ GHASH block 4k+4 3626 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 3627 3628 aese q3, v29.16b @ AES block 4k+7 - round 11 3629 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 3630 blt .L192_dec_main_loop 3631 3632 .L192_dec_prepretail:@ PREPRETAIL 3633 mov r22, v2.d[1] @ AES block 4k+2 - mov high 3634 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 3635 eor q3, q7, q3 @ AES block 4k+3 - result 3636 3637 aese q1, v18.16b 3638 aesmc q1, q1 @ AES block 4k+5 - round 0 3639 mov r21, v2.d[0] @ AES block 4k+2 - mov low 3640 3641 aese q0, v18.16b 3642 aesmc q0, q0 @ 
AES block 4k+4 - round 0 3643 mov d10, v17.d[1] @ GHASH block 4k - mid 3644 3645 eor q4, q4, v11.16b @ PRE 1 3646 fmov d2, r10 @ CTR block 4k+6 3647 3648 aese q1, v19.16b 3649 aesmc q1, q1 @ AES block 4k+5 - round 1 3650 mov r23, v3.d[0] @ AES block 4k+3 - mov low 3651 3652 aese q0, v19.16b 3653 aesmc q0, q0 @ AES block 4k+4 - round 1 3654 mov r24, v3.d[1] @ AES block 4k+3 - mov high 3655 3656 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 3657 mov d8, v4.d[1] @ GHASH block 4k - mid 3658 fmov d3, r10 @ CTR block 4k+7 3659 3660 aese q1, v20.16b 3661 aesmc q1, q1 @ AES block 4k+5 - round 2 3662 rev64 q6, q6 @ GHASH block 4k+2 3663 3664 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 3665 fmov v2.d[1], r9 @ CTR block 4k+6 3666 rev r9, r12 @ CTR block 4k+7 3667 3668 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 3669 eor q8, q8, q4 @ GHASH block 4k - mid 3670 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 3671 3672 pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low 3673 eor r24, r24, r14 @ AES block 4k+3 - round 12 high 3674 #ifdef __ARMEB__ 3675 rev r24, r24 3676 #endif 3677 fmov v3.d[1], r9 @ CTR block 4k+7 3678 3679 aese q0, v20.16b 3680 aesmc q0, q0 @ AES block 4k+4 - round 2 3681 eor r21, r21, r13 @ AES block 4k+2 - round 12 low 3682 #ifdef __ARMEB__ 3683 rev r21, r21 3684 #endif 3685 pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high 3686 eor r22, r22, r14 @ AES block 4k+2 - round 12 high 3687 #ifdef __ARMEB__ 3688 rev r22, r22 3689 #endif 3690 eor q4, q4, q5 @ GHASH block 4k+1 - mid 3691 3692 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 3693 eor r23, r23, r13 @ AES block 4k+3 - round 12 low 3694 #ifdef __ARMEB__ 3695 rev r23, r23 3696 #endif 3697 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 3698 3699 rev64 q7, q7 @ GHASH block 4k+3 3700 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 3701 3702 aese q3, v18.16b 3703 aesmc q3, q3 @ AES block 4k+7 - round 0 3704 eor q9, q9, v30.16b @ GHASH block 4k+1 - high 3705 3706 pmull v4.1q, q4, v17.1d @ GHASH 
block 4k+1 - mid 3707 add r12, r12, #1 @ CTR block 4k+7 3708 3709 pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high 3710 eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low 3711 3712 aese q2, v18.16b 3713 aesmc q2, q2 @ AES block 4k+6 - round 0 3714 3715 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 3716 mov d31, v6.d[1] @ GHASH block 4k+2 - mid 3717 3718 aese q3, v19.16b 3719 aesmc q3, q3 @ AES block 4k+7 - round 1 3720 3721 aese q2, v19.16b 3722 aesmc q2, q2 @ AES block 4k+6 - round 1 3723 eor q9, q9, v30.16b @ GHASH block 4k+2 - high 3724 3725 eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid 3726 3727 pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low 3728 3729 aese q2, v20.16b 3730 aesmc q2, q2 @ AES block 4k+6 - round 2 3731 mov d30, v7.d[1] @ GHASH block 4k+3 - mid 3732 3733 aese q3, v20.16b 3734 aesmc q3, q3 @ AES block 4k+7 - round 2 3735 ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid 3736 3737 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 3738 3739 aese q0, v21.16b 3740 aesmc q0, q0 @ AES block 4k+4 - round 3 3741 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid 3742 3743 aese q1, v21.16b 3744 aesmc q1, q1 @ AES block 4k+5 - round 3 3745 3746 pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid 3747 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low 3748 3749 aese q0, v22.16b 3750 aesmc q0, q0 @ AES block 4k+4 - round 4 3751 3752 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 3753 movi q8, #0xc2 3754 3755 pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid 3756 3757 aese q2, v21.16b 3758 aesmc q2, q2 @ AES block 4k+6 - round 3 3759 3760 shl d8, d8, #56 @ mod_constant 3761 eor q9, q9, q5 @ GHASH block 4k+3 - high 3762 3763 aese q0, v23.16b 3764 aesmc q0, q0 @ AES block 4k+4 - round 5 3765 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid 3766 3767 aese q2, v22.16b 3768 aesmc q2, q2 @ AES block 4k+6 - round 4 3769 3770 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 3771 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 3772 3773 
aese q0, v24.16b 3774 aesmc q0, q0 @ AES block 4k+4 - round 6 3775 3776 aese q3, v21.16b 3777 aesmc q3, q3 @ AES block 4k+7 - round 3 3778 eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid 3779 3780 aese q2, v23.16b 3781 aesmc q2, q2 @ AES block 4k+6 - round 5 3782 3783 aese q0, v25.16b 3784 aesmc q0, q0 @ AES block 4k+4 - round 7 3785 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 3786 3787 aese q3, v22.16b 3788 aesmc q3, q3 @ AES block 4k+7 - round 4 3789 3790 aese q2, v24.16b 3791 aesmc q2, q2 @ AES block 4k+6 - round 6 3792 ext q9, q9, q9, #8 @ MODULO - other top alignment 3793 3794 aese q0, v26.16b 3795 aesmc q0, q0 @ AES block 4k+4 - round 8 3796 3797 aese q3, v23.16b 3798 aesmc q3, q3 @ AES block 4k+7 - round 5 3799 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 3800 3801 aese q1, v22.16b 3802 aesmc q1, q1 @ AES block 4k+5 - round 4 3803 3804 aese q2, v25.16b 3805 aesmc q2, q2 @ AES block 4k+6 - round 7 3806 3807 aese q0, v27.16b 3808 aesmc q0, q0 @ AES block 4k+4 - round 9 3809 3810 aese q1, v23.16b 3811 aesmc q1, q1 @ AES block 4k+5 - round 5 3812 3813 aese q3, v24.16b 3814 aesmc q3, q3 @ AES block 4k+7 - round 6 3815 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 3816 3817 aese q0, v28.16b 3818 aesmc q0, q0 @ AES block 4k+4 - round 10 3819 3820 aese q1, v24.16b 3821 aesmc q1, q1 @ AES block 4k+5 - round 6 3822 3823 aese q3, v25.16b 3824 aesmc q3, q3 @ AES block 4k+7 - round 7 3825 3826 aese q2, v26.16b 3827 aesmc q2, q2 @ AES block 4k+6 - round 8 3828 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 3829 3830 aese q1, v25.16b 3831 aesmc q1, q1 @ AES block 4k+5 - round 7 3832 3833 aese q3, v26.16b 3834 aesmc q3, q3 @ AES block 4k+7 - round 8 3835 3836 aese q2, v27.16b 3837 aesmc q2, q2 @ AES block 4k+6 - round 9 3838 3839 aese q1, v26.16b 3840 aesmc q1, q1 @ AES block 4k+5 - round 8 3841 3842 aese q3, v27.16b 3843 aesmc q3, q3 @ AES block 4k+7 - round 9 3844 3845 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 
3846 3847 aese q1, v27.16b 3848 aesmc q1, q1 @ AES block 4k+5 - round 9 3849 3850 aese q2, v28.16b 3851 aesmc q2, q2 @ AES block 4k+6 - round 10 3852 3853 aese q3, v28.16b 3854 aesmc q3, q3 @ AES block 4k+7 - round 10 3855 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 3856 3857 aese q1, v28.16b 3858 aesmc q1, q1 @ AES block 4k+5 - round 10 3859 3860 aese q0, v29.16b 3861 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 3862 3863 aese q2, v29.16b 3864 3865 aese q1, v29.16b 3866 3867 aese q3, v29.16b 3868 3869 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 3870 .L192_dec_tail:@ TAIL 3871 3872 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 3873 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 3874 3875 eor q0, q5, q0 @ AES block 4k+4 - result 3876 3877 mov r7, v0.d[1] @ AES block 4k+4 - mov high 3878 3879 mov r6, v0.d[0] @ AES block 4k+4 - mov low 3880 3881 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 3882 3883 cmp r5, #48 3884 3885 eor r7, r7, r14 @ AES block 4k+4 - round 12 high 3886 #ifdef __ARMEB__ 3887 rev r7, r7 3888 #endif 3889 eor r6, r6, r13 @ AES block 4k+4 - round 12 low 3890 #ifdef __ARMEB__ 3891 rev r6, r6 3892 #endif 3893 bgt .L192_dec_blocks_more_than_3 3894 3895 movi v11.8b, #0 3896 movi q9, #0 3897 3898 mov q3, q2 3899 mov q2, q1 3900 sub r12, r12, #1 3901 3902 movi v10.8b, #0 3903 cmp r5, #32 3904 bgt .L192_dec_blocks_more_than_2 3905 3906 mov q3, q1 3907 cmp r5, #16 3908 sub r12, r12, #1 3909 3910 bgt .L192_dec_blocks_more_than_1 3911 3912 sub r12, r12, #1 3913 b .L192_dec_blocks_less_than_1 3914 .L192_dec_blocks_more_than_3:@ blocks left > 3 3915 rev64 q4, q5 @ GHASH final-3 block 3916 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 3917 3918 stp r6, r7, [r2], #16 @ AES final-3 block - store result 3919 3920 eor q4, q4, q8 @ feed in partial tag 3921 3922 eor q0, q5, q1 @ AES final-2 block - result 3923 3924 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 3925 mov 
r6, v0.d[0] @ AES final-2 block - mov low 3926 mov d22, v4.d[1] @ GHASH final-3 block - mid 3927 3928 mov r7, v0.d[1] @ AES final-2 block - mov high 3929 3930 mov d10, v17.d[1] @ GHASH final-3 block - mid 3931 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 3932 3933 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 3934 3935 eor r6, r6, r13 @ AES final-2 block - round 12 low 3936 #ifdef __ARMEB__ 3937 rev r6, r6 3938 #endif 3939 movi q8, #0 @ suppress further partial tag feed in 3940 3941 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 3942 eor r7, r7, r14 @ AES final-2 block - round 12 high 3943 #ifdef __ARMEB__ 3944 rev r7, r7 3945 #endif 3946 .L192_dec_blocks_more_than_2:@ blocks left > 2 3947 3948 rev64 q4, q5 @ GHASH final-2 block 3949 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 3950 3951 eor q4, q4, q8 @ feed in partial tag 3952 3953 movi q8, #0 @ suppress further partial tag feed in 3954 3955 eor q0, q5, q2 @ AES final-1 block - result 3956 3957 mov d22, v4.d[1] @ GHASH final-2 block - mid 3958 3959 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 3960 3961 stp r6, r7, [r2], #16 @ AES final-2 block - store result 3962 3963 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 3964 mov r7, v0.d[1] @ AES final-1 block - mov high 3965 3966 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 3967 mov r6, v0.d[0] @ AES final-1 block - mov low 3968 3969 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 3970 3971 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 3972 3973 eor q9, q9, v20.16b @ GHASH final-2 block - high 3974 eor r7, r7, r14 @ AES final-1 block - round 12 high 3975 #ifdef __ARMEB__ 3976 rev r7, r7 3977 #endif 3978 eor r6, r6, r13 @ AES final-1 block - round 12 low 3979 #ifdef __ARMEB__ 3980 rev r6, r6 3981 #endif 3982 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 3983 .L192_dec_blocks_more_than_1:@ blocks left > 1 3984 3985 rev64 q4, q5 @ GHASH final-1 block 3986 3987 eor q4, q4, q8 @ 
feed in partial tag 3988 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 3989 3990 mov d22, v4.d[1] @ GHASH final-1 block - mid 3991 3992 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 3993 3994 eor q0, q5, q3 @ AES final block - result 3995 stp r6, r7, [r2], #16 @ AES final-1 block - store result 3996 3997 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 3998 3999 eor q9, q9, v20.16b @ GHASH final-1 block - high 4000 4001 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 4002 mov r7, v0.d[1] @ AES final block - mov high 4003 4004 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 4005 mov r6, v0.d[0] @ AES final block - mov low 4006 4007 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 4008 4009 movi q8, #0 @ suppress further partial tag feed in 4010 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 4011 eor r7, r7, r14 @ AES final block - round 12 high 4012 #ifdef __ARMEB__ 4013 rev r7, r7 4014 #endif 4015 eor r6, r6, r13 @ AES final block - round 12 low 4016 #ifdef __ARMEB__ 4017 rev r6, r6 4018 #endif 4019 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 4020 .L192_dec_blocks_less_than_1:@ blocks left <= 1 4021 4022 mvn r13, xzr @ rk12_l = 0xffffffffffffffff 4023 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 4024 and r1, r1, #127 @ bit_length %= 128 4025 4026 sub r1, r1, #128 @ bit_length -= 128 4027 4028 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 4029 4030 and r1, r1, #127 @ bit_length %= 128 4031 mvn r14, xzr @ rk12_h = 0xffffffffffffffff 4032 4033 lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block 4034 cmp r1, #64 4035 4036 csel r9, r13, r14, lt 4037 csel r10, r14, xzr, lt 4038 4039 fmov d0, r9 @ ctr0b is mask for last block 4040 and r6, r6, r9 4041 bic r4, r4, r9 @ mask out low existing bytes 4042 4043 orr r6, r6, r4 4044 mov v0.d[1], r10 4045 #ifndef __ARMEB__ 4046 rev r9, r12 4047 #else 4048 mov r9, r12 4049 #endif 4050 4051 and q5, q5, q0 @ possibly partial 
last block has zeroes in highest bits 4052 str r9, [r16, #12] @ store the updated counter 4053 4054 rev64 q4, q5 @ GHASH final block 4055 4056 eor q4, q4, q8 @ feed in partial tag 4057 bic r5, r5, r10 @ mask out high existing bytes 4058 4059 and r7, r7, r10 4060 4061 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 4062 mov d8, v4.d[1] @ GHASH final block - mid 4063 4064 pmull v21.1q, q4, v12.1d @ GHASH final block - low 4065 4066 eor q8, q8, q4 @ GHASH final block - mid 4067 4068 eor q9, q9, v20.16b @ GHASH final block - high 4069 4070 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 4071 4072 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 4073 4074 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 4075 movi q8, #0xc2 4076 4077 eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up 4078 4079 shl d8, d8, #56 @ mod_constant 4080 4081 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up 4082 4083 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid 4084 orr r7, r7, r5 4085 stp r6, r7, [r2] 4086 4087 ext q9, q9, q9, #8 @ MODULO - other top alignment 4088 4089 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid 4090 4091 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 4092 4093 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 4094 4095 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 4096 4097 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 4098 4099 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 4100 ext v11.16b, v11.16b, v11.16b, #8 4101 rev64 v11.16b, v11.16b 4102 mov r0, r15 4103 st1 { v11.16b }, [r3] 4104 4105 ldp r21, r22, [sp, #16] 4106 ldp r23, r24, [sp, #32] 4107 ldp d8, d9, [sp, #48] 4108 ldp d10, d11, [sp, #64] 4109 ldp d12, d13, [sp, #80] 4110 ldp d14, d15, [sp, #96] 4111 ldp r19, r20, [sp], #112 4112 RET 4113 4114 .L192_dec_ret: 4115 mov r0, #0x0 4116 RET 4117 .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 4118 .globl aes_gcm_enc_256_kernel 4119 .type aes_gcm_enc_256_kernel,%function 4120 
.align 4 4121 aes_gcm_enc_256_kernel: 4122 AARCH64_VALID_CALL_TARGET 4123 cbz r1, .L256_enc_ret 4124 stp r19, r20, [sp, #-112]! 4125 mov r16, r4 4126 mov r8, r5 4127 stp r21, r22, [sp, #16] 4128 stp r23, r24, [sp, #32] 4129 stp d8, d9, [sp, #48] 4130 stp d10, d11, [sp, #64] 4131 stp d12, d13, [sp, #80] 4132 stp d14, d15, [sp, #96] 4133 4134 add r4, r0, r1, lsr #3 @ end_input_ptr 4135 lsr r5, r1, #3 @ byte_len 4136 mov r15, r5 4137 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 4138 #ifdef __ARMEB__ 4139 rev r10, r10 4140 rev r11, r11 4141 #endif 4142 ldp r13, r14, [r8, #224] @ load rk14 4143 #ifdef __ARMEB__ 4144 ror r13, r13, #32 4145 ror r14, r14, #32 4146 #endif 4147 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible 4148 sub r5, r5, #1 @ byte_len - 1 4149 4150 ld1 {v18.4s}, [r8], #16 @ load rk0 4151 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4152 4153 ld1 {v19.4s}, [r8], #16 @ load rk1 4154 add r5, r5, r0 4155 4156 lsr r12, r11, #32 4157 fmov d2, r10 @ CTR block 2 4158 orr r11, r11, r11 4159 4160 rev r12, r12 @ rev_ctr32 4161 cmp r0, r5 @ check if we have <= 4 blocks 4162 fmov d1, r10 @ CTR block 1 4163 4164 aese q0, v18.16b 4165 aesmc q0, q0 @ AES block 0 - round 0 4166 add r12, r12, #1 @ increment rev_ctr32 4167 4168 rev r9, r12 @ CTR block 1 4169 fmov d3, r10 @ CTR block 3 4170 4171 orr r9, r11, r9, lsl #32 @ CTR block 1 4172 add r12, r12, #1 @ CTR block 1 4173 ld1 {v20.4s}, [r8], #16 @ load rk2 4174 4175 fmov v1.d[1], r9 @ CTR block 1 4176 rev r9, r12 @ CTR block 2 4177 add r12, r12, #1 @ CTR block 2 4178 4179 orr r9, r11, r9, lsl #32 @ CTR block 2 4180 ld1 {v21.4s}, [r8], #16 @ load rk3 4181 4182 fmov v2.d[1], r9 @ CTR block 2 4183 rev r9, r12 @ CTR block 3 4184 4185 aese q0, v19.16b 4186 aesmc q0, q0 @ AES block 0 - round 1 4187 orr r9, r11, r9, lsl #32 @ CTR block 3 4188 4189 fmov v3.d[1], r9 @ CTR block 3 4190 4191 
aese q1, v18.16b 4192 aesmc q1, q1 @ AES block 1 - round 0 4193 ld1 {v22.4s}, [r8], #16 @ load rk4 4194 4195 aese q0, v20.16b 4196 aesmc q0, q0 @ AES block 0 - round 2 4197 ld1 {v23.4s}, [r8], #16 @ load rk5 4198 4199 aese q2, v18.16b 4200 aesmc q2, q2 @ AES block 2 - round 0 4201 ld1 {v24.4s}, [r8], #16 @ load rk6 4202 4203 aese q1, v19.16b 4204 aesmc q1, q1 @ AES block 1 - round 1 4205 ldr q14, [r3, #80] @ load h3l | h3h 4206 #ifndef __ARMEB__ 4207 ext v14.16b, v14.16b, v14.16b, #8 4208 #endif 4209 aese q3, v18.16b 4210 aesmc q3, q3 @ AES block 3 - round 0 4211 ld1 {v25.4s}, [r8], #16 @ load rk7 4212 4213 aese q2, v19.16b 4214 aesmc q2, q2 @ AES block 2 - round 1 4215 ld1 {v26.4s}, [r8], #16 @ load rk8 4216 4217 aese q1, v20.16b 4218 aesmc q1, q1 @ AES block 1 - round 2 4219 ldr q13, [r3, #64] @ load h2l | h2h 4220 #ifndef __ARMEB__ 4221 ext v13.16b, v13.16b, v13.16b, #8 4222 #endif 4223 aese q3, v19.16b 4224 aesmc q3, q3 @ AES block 3 - round 1 4225 ld1 {v27.4s}, [r8], #16 @ load rk9 4226 4227 aese q2, v20.16b 4228 aesmc q2, q2 @ AES block 2 - round 2 4229 ldr q15, [r3, #112] @ load h4l | h4h 4230 #ifndef __ARMEB__ 4231 ext v15.16b, v15.16b, v15.16b, #8 4232 #endif 4233 aese q1, v21.16b 4234 aesmc q1, q1 @ AES block 1 - round 3 4235 ld1 {v28.4s}, [r8], #16 @ load rk10 4236 4237 aese q3, v20.16b 4238 aesmc q3, q3 @ AES block 3 - round 2 4239 ld1 {v29.4s}, [r8], #16 @ load rk11 4240 4241 aese q2, v21.16b 4242 aesmc q2, q2 @ AES block 2 - round 3 4243 add r12, r12, #1 @ CTR block 3 4244 4245 aese q0, v21.16b 4246 aesmc q0, q0 @ AES block 0 - round 3 4247 4248 aese q3, v21.16b 4249 aesmc q3, q3 @ AES block 3 - round 3 4250 ld1 { v11.16b}, [r3] 4251 ext v11.16b, v11.16b, v11.16b, #8 4252 rev64 v11.16b, v11.16b 4253 4254 aese q2, v22.16b 4255 aesmc q2, q2 @ AES block 2 - round 4 4256 4257 aese q0, v22.16b 4258 aesmc q0, q0 @ AES block 0 - round 4 4259 4260 aese q1, v22.16b 4261 aesmc q1, q1 @ AES block 1 - round 4 4262 4263 aese q3, v22.16b 4264 aesmc q3, q3 @ AES 
block 3 - round 4 4265 4266 aese q0, v23.16b 4267 aesmc q0, q0 @ AES block 0 - round 5 4268 4269 aese q1, v23.16b 4270 aesmc q1, q1 @ AES block 1 - round 5 4271 4272 aese q3, v23.16b 4273 aesmc q3, q3 @ AES block 3 - round 5 4274 4275 aese q2, v23.16b 4276 aesmc q2, q2 @ AES block 2 - round 5 4277 4278 aese q1, v24.16b 4279 aesmc q1, q1 @ AES block 1 - round 6 4280 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 4281 4282 aese q3, v24.16b 4283 aesmc q3, q3 @ AES block 3 - round 6 4284 ld1 {v30.4s}, [r8], #16 @ load rk12 4285 4286 aese q0, v24.16b 4287 aesmc q0, q0 @ AES block 0 - round 6 4288 ldr q12, [r3, #32] @ load h1l | h1h 4289 #ifndef __ARMEB__ 4290 ext v12.16b, v12.16b, v12.16b, #8 4291 #endif 4292 aese q2, v24.16b 4293 aesmc q2, q2 @ AES block 2 - round 6 4294 ld1 {v31.4s}, [r8], #16 @ load rk13 4295 4296 aese q1, v25.16b 4297 aesmc q1, q1 @ AES block 1 - round 7 4298 trn1 q9, v14.2d, v15.2d @ h4h | h3h 4299 4300 aese q0, v25.16b 4301 aesmc q0, q0 @ AES block 0 - round 7 4302 4303 aese q2, v25.16b 4304 aesmc q2, q2 @ AES block 2 - round 7 4305 4306 aese q3, v25.16b 4307 aesmc q3, q3 @ AES block 3 - round 7 4308 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 4309 4310 aese q1, v26.16b 4311 aesmc q1, q1 @ AES block 1 - round 8 4312 4313 aese q2, v26.16b 4314 aesmc q2, q2 @ AES block 2 - round 8 4315 4316 aese q3, v26.16b 4317 aesmc q3, q3 @ AES block 3 - round 8 4318 4319 aese q1, v27.16b 4320 aesmc q1, q1 @ AES block 1 - round 9 4321 4322 aese q2, v27.16b 4323 aesmc q2, q2 @ AES block 2 - round 9 4324 4325 aese q0, v26.16b 4326 aesmc q0, q0 @ AES block 0 - round 8 4327 4328 aese q1, v28.16b 4329 aesmc q1, q1 @ AES block 1 - round 10 4330 4331 aese q3, v27.16b 4332 aesmc q3, q3 @ AES block 3 - round 9 4333 4334 aese q0, v27.16b 4335 aesmc q0, q0 @ AES block 0 - round 9 4336 4337 aese q2, v28.16b 4338 aesmc q2, q2 @ AES block 2 - round 10 4339 4340 aese q3, v28.16b 4341 aesmc q3, q3 @ AES block 3 - round 10 4342 4343 aese q1, v29.16b 4344 aesmc q1, q1 @ AES block 1 - round 
11 4345 4346 aese q2, v29.16b 4347 aesmc q2, q2 @ AES block 2 - round 11 4348 4349 aese q0, v28.16b 4350 aesmc q0, q0 @ AES block 0 - round 10 4351 4352 aese q1, v30.16b 4353 aesmc q1, q1 @ AES block 1 - round 12 4354 4355 aese q2, v30.16b 4356 aesmc q2, q2 @ AES block 2 - round 12 4357 4358 aese q0, v29.16b 4359 aesmc q0, q0 @ AES block 0 - round 11 4360 eor v17.16b, v17.16b, q9 @ h4k | h3k 4361 4362 aese q3, v29.16b 4363 aesmc q3, q3 @ AES block 3 - round 11 4364 4365 aese q2, v31.16b @ AES block 2 - round 13 4366 trn1 q8, v12.2d, v13.2d @ h2h | h1h 4367 4368 aese q0, v30.16b 4369 aesmc q0, q0 @ AES block 0 - round 12 4370 4371 aese q3, v30.16b 4372 aesmc q3, q3 @ AES block 3 - round 12 4373 4374 aese q1, v31.16b @ AES block 1 - round 13 4375 4376 aese q0, v31.16b @ AES block 0 - round 13 4377 4378 aese q3, v31.16b @ AES block 3 - round 13 4379 eor v16.16b, v16.16b, q8 @ h2k | h1k 4380 bge .L256_enc_tail @ handle tail 4381 4382 ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext 4383 #ifdef __ARMEB__ 4384 rev r19, r19 4385 rev r20, r20 4386 #endif 4387 rev r9, r12 @ CTR block 4 4388 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext 4389 #ifdef __ARMEB__ 4390 rev r6, r6 4391 rev r7, r7 4392 #endif 4393 ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext 4394 #ifdef __ARMEB__ 4395 rev r23, r23 4396 rev r24, r24 4397 #endif 4398 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext 4399 #ifdef __ARMEB__ 4400 rev r21, r21 4401 rev r22, r22 4402 #endif 4403 add r0, r0, #64 @ AES input_ptr update 4404 4405 eor r19, r19, r13 @ AES block 1 - round 14 low 4406 eor r20, r20, r14 @ AES block 1 - round 14 high 4407 4408 fmov d5, r19 @ AES block 1 - mov low 4409 eor r6, r6, r13 @ AES block 0 - round 14 low 4410 4411 eor r7, r7, r14 @ AES block 0 - round 14 high 4412 eor r24, r24, r14 @ AES block 3 - round 14 high 4413 fmov d4, r6 @ AES block 0 - mov low 4414 4415 cmp r0, r5 @ check if we have <= 8 blocks 4416 fmov v4.d[1], r7 @ AES block 0 - mov high 4417 eor r23, 
r23, r13 @ AES block 3 - round 14 low 4418 4419 eor r21, r21, r13 @ AES block 2 - round 14 low 4420 fmov v5.d[1], r20 @ AES block 1 - mov high 4421 4422 fmov d6, r21 @ AES block 2 - mov low 4423 add r12, r12, #1 @ CTR block 4 4424 4425 orr r9, r11, r9, lsl #32 @ CTR block 4 4426 fmov d7, r23 @ AES block 3 - mov low 4427 eor r22, r22, r14 @ AES block 2 - round 14 high 4428 4429 fmov v6.d[1], r22 @ AES block 2 - mov high 4430 4431 eor q4, q4, q0 @ AES block 0 - result 4432 fmov d0, r10 @ CTR block 4 4433 4434 fmov v0.d[1], r9 @ CTR block 4 4435 rev r9, r12 @ CTR block 5 4436 add r12, r12, #1 @ CTR block 5 4437 4438 eor q5, q5, q1 @ AES block 1 - result 4439 fmov d1, r10 @ CTR block 5 4440 orr r9, r11, r9, lsl #32 @ CTR block 5 4441 4442 fmov v1.d[1], r9 @ CTR block 5 4443 rev r9, r12 @ CTR block 6 4444 st1 { q4}, [r2], #16 @ AES block 0 - store result 4445 4446 fmov v7.d[1], r24 @ AES block 3 - mov high 4447 orr r9, r11, r9, lsl #32 @ CTR block 6 4448 eor q6, q6, q2 @ AES block 2 - result 4449 4450 st1 { q5}, [r2], #16 @ AES block 1 - store result 4451 4452 add r12, r12, #1 @ CTR block 6 4453 fmov d2, r10 @ CTR block 6 4454 4455 fmov v2.d[1], r9 @ CTR block 6 4456 st1 { q6}, [r2], #16 @ AES block 2 - store result 4457 rev r9, r12 @ CTR block 7 4458 4459 orr r9, r11, r9, lsl #32 @ CTR block 7 4460 4461 eor q7, q7, q3 @ AES block 3 - result 4462 st1 { q7}, [r2], #16 @ AES block 3 - store result 4463 bge .L256_enc_prepretail @ do prepretail 4464 4465 .L256_enc_main_loop:@ main loop start 4466 aese q0, v18.16b 4467 aesmc q0, q0 @ AES block 4k+4 - round 0 4468 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 4469 4470 aese q1, v18.16b 4471 aesmc q1, q1 @ AES block 4k+5 - round 0 4472 fmov d3, r10 @ CTR block 4k+3 4473 4474 aese q2, v18.16b 4475 aesmc q2, q2 @ AES block 4k+6 - round 0 4476 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 4477 4478 aese q0, v19.16b 4479 aesmc q0, q0 @ AES block 4k+4 - round 1 4480 fmov v3.d[1], r9 @ CTR block 4k+3 4481 4482 aese q1, v19.16b 4483 
aesmc q1, q1 @ AES block 4k+5 - round 1 4484 ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext 4485 #ifdef __ARMEB__ 4486 rev r23, r23 4487 rev r24, r24 4488 #endif 4489 aese q2, v19.16b 4490 aesmc q2, q2 @ AES block 4k+6 - round 1 4491 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext 4492 #ifdef __ARMEB__ 4493 rev r21, r21 4494 rev r22, r22 4495 #endif 4496 aese q0, v20.16b 4497 aesmc q0, q0 @ AES block 4k+4 - round 2 4498 eor q4, q4, v11.16b @ PRE 1 4499 4500 aese q1, v20.16b 4501 aesmc q1, q1 @ AES block 4k+5 - round 2 4502 4503 aese q3, v18.16b 4504 aesmc q3, q3 @ AES block 4k+7 - round 0 4505 eor r23, r23, r13 @ AES block 4k+7 - round 14 low 4506 4507 aese q0, v21.16b 4508 aesmc q0, q0 @ AES block 4k+4 - round 3 4509 mov d10, v17.d[1] @ GHASH block 4k - mid 4510 4511 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 4512 eor r22, r22, r14 @ AES block 4k+6 - round 14 high 4513 mov d8, v4.d[1] @ GHASH block 4k - mid 4514 4515 aese q3, v19.16b 4516 aesmc q3, q3 @ AES block 4k+7 - round 1 4517 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 4518 4519 aese q0, v22.16b 4520 aesmc q0, q0 @ AES block 4k+4 - round 4 4521 4522 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 4523 eor q8, q8, q4 @ GHASH block 4k - mid 4524 4525 aese q2, v20.16b 4526 aesmc q2, q2 @ AES block 4k+6 - round 2 4527 4528 aese q0, v23.16b 4529 aesmc q0, q0 @ AES block 4k+4 - round 5 4530 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4531 4532 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 4533 4534 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 4535 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 4536 4537 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 4538 4539 eor q9, q9, q4 @ GHASH block 4k+1 - high 4540 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 4541 4542 aese q1, v21.16b 4543 aesmc q1, q1 @ AES block 4k+5 - round 3 4544 4545 aese q3, v20.16b 4546 aesmc q3, q3 @ AES block 4k+7 - round 2 4547 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 4548 4549 
aese q2, v21.16b 4550 aesmc q2, q2 @ AES block 4k+6 - round 3 4551 4552 aese q1, v22.16b 4553 aesmc q1, q1 @ AES block 4k+5 - round 4 4554 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 4555 4556 aese q3, v21.16b 4557 aesmc q3, q3 @ AES block 4k+7 - round 3 4558 eor q4, q4, q5 @ GHASH block 4k+1 - mid 4559 4560 aese q2, v22.16b 4561 aesmc q2, q2 @ AES block 4k+6 - round 4 4562 4563 aese q0, v24.16b 4564 aesmc q0, q0 @ AES block 4k+4 - round 6 4565 eor q8, q8, q6 @ GHASH block 4k+2 - mid 4566 4567 aese q3, v22.16b 4568 aesmc q3, q3 @ AES block 4k+7 - round 4 4569 4570 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 4571 4572 aese q0, v25.16b 4573 aesmc q0, q0 @ AES block 4k+4 - round 7 4574 4575 aese q3, v23.16b 4576 aesmc q3, q3 @ AES block 4k+7 - round 5 4577 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 4578 4579 aese q1, v23.16b 4580 aesmc q1, q1 @ AES block 4k+5 - round 5 4581 4582 aese q0, v26.16b 4583 aesmc q0, q0 @ AES block 4k+4 - round 8 4584 4585 aese q2, v23.16b 4586 aesmc q2, q2 @ AES block 4k+6 - round 5 4587 4588 aese q1, v24.16b 4589 aesmc q1, q1 @ AES block 4k+5 - round 6 4590 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 4591 4592 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 4593 4594 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 4595 4596 aese q1, v25.16b 4597 aesmc q1, q1 @ AES block 4k+5 - round 7 4598 4599 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 4600 eor q9, q9, q4 @ GHASH block 4k+2 - high 4601 4602 aese q3, v24.16b 4603 aesmc q3, q3 @ AES block 4k+7 - round 6 4604 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext 4605 #ifdef __ARMEB__ 4606 rev r19, r19 4607 rev r20, r20 4608 #endif 4609 aese q1, v26.16b 4610 aesmc q1, q1 @ AES block 4k+5 - round 8 4611 mov d4, v7.d[1] @ GHASH block 4k+3 - mid 4612 4613 aese q2, v24.16b 4614 aesmc q2, q2 @ AES block 4k+6 - round 6 4615 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 4616 4617 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 4618 4619 pmull2 v5.1q, q7, v12.2d @ GHASH 
block 4k+3 - high 4620 eor q4, q4, q7 @ GHASH block 4k+3 - mid 4621 4622 aese q2, v25.16b 4623 aesmc q2, q2 @ AES block 4k+6 - round 7 4624 eor r19, r19, r13 @ AES block 4k+5 - round 14 low 4625 4626 aese q1, v27.16b 4627 aesmc q1, q1 @ AES block 4k+5 - round 9 4628 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 4629 4630 aese q3, v25.16b 4631 aesmc q3, q3 @ AES block 4k+7 - round 7 4632 eor r21, r21, r13 @ AES block 4k+6 - round 14 low 4633 4634 aese q0, v27.16b 4635 aesmc q0, q0 @ AES block 4k+4 - round 9 4636 movi q8, #0xc2 4637 4638 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid 4639 eor q9, q9, q5 @ GHASH block 4k+3 - high 4640 fmov d5, r19 @ AES block 4k+5 - mov low 4641 4642 aese q2, v26.16b 4643 aesmc q2, q2 @ AES block 4k+6 - round 8 4644 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext 4645 #ifdef __ARMEB__ 4646 rev r6, r6 4647 rev r7, r7 4648 #endif 4649 aese q0, v28.16b 4650 aesmc q0, q0 @ AES block 4k+4 - round 10 4651 shl d8, d8, #56 @ mod_constant 4652 4653 aese q3, v26.16b 4654 aesmc q3, q3 @ AES block 4k+7 - round 8 4655 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 4656 4657 aese q2, v27.16b 4658 aesmc q2, q2 @ AES block 4k+6 - round 9 4659 4660 aese q1, v28.16b 4661 aesmc q1, q1 @ AES block 4k+5 - round 10 4662 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid 4663 4664 aese q3, v27.16b 4665 aesmc q3, q3 @ AES block 4k+7 - round 9 4666 add r12, r12, #1 @ CTR block 4k+3 4667 4668 aese q0, v29.16b 4669 aesmc q0, q0 @ AES block 4k+4 - round 11 4670 eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up 4671 4672 aese q1, v29.16b 4673 aesmc q1, q1 @ AES block 4k+5 - round 11 4674 add r0, r0, #64 @ AES input_ptr update 4675 4676 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 4677 rev r9, r12 @ CTR block 4k+8 4678 ext q9, q9, q9, #8 @ MODULO - other top alignment 4679 4680 aese q2, v28.16b 4681 aesmc q2, q2 @ AES block 4k+6 - round 10 4682 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 4683 4684 aese q1, v30.16b 4685 aesmc q1, q1 @ AES 
block 4k+5 - round 12 4686 eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up 4687 4688 aese q3, v28.16b 4689 aesmc q3, q3 @ AES block 4k+7 - round 10 4690 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 4691 4692 fmov d4, r6 @ AES block 4k+4 - mov low 4693 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 4694 eor q7, q9, q7 @ MODULO - fold into mid 4695 4696 aese q0, v30.16b 4697 aesmc q0, q0 @ AES block 4k+4 - round 12 4698 eor r20, r20, r14 @ AES block 4k+5 - round 14 high 4699 4700 aese q2, v29.16b 4701 aesmc q2, q2 @ AES block 4k+6 - round 11 4702 eor r24, r24, r14 @ AES block 4k+7 - round 14 high 4703 4704 aese q3, v29.16b 4705 aesmc q3, q3 @ AES block 4k+7 - round 11 4706 add r12, r12, #1 @ CTR block 4k+8 4707 4708 aese q0, v31.16b @ AES block 4k+4 - round 13 4709 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 4710 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 4711 4712 aese q2, v30.16b 4713 aesmc q2, q2 @ AES block 4k+6 - round 12 4714 fmov d7, r23 @ AES block 4k+7 - mov low 4715 4716 aese q1, v31.16b @ AES block 4k+5 - round 13 4717 fmov v5.d[1], r20 @ AES block 4k+5 - mov high 4718 4719 fmov d6, r21 @ AES block 4k+6 - mov low 4720 cmp r0, r5 @ .LOOP CONTROL 4721 4722 fmov v6.d[1], r22 @ AES block 4k+6 - mov high 4723 4724 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 4725 eor q4, q4, q0 @ AES block 4k+4 - result 4726 fmov d0, r10 @ CTR block 4k+8 4727 4728 fmov v0.d[1], r9 @ CTR block 4k+8 4729 rev r9, r12 @ CTR block 4k+9 4730 add r12, r12, #1 @ CTR block 4k+9 4731 4732 eor q5, q5, q1 @ AES block 4k+5 - result 4733 fmov d1, r10 @ CTR block 4k+9 4734 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 4735 4736 aese q3, v30.16b 4737 aesmc q3, q3 @ AES block 4k+7 - round 12 4738 fmov v1.d[1], r9 @ CTR block 4k+9 4739 4740 aese q2, v31.16b @ AES block 4k+6 - round 13 4741 rev r9, r12 @ CTR block 4k+10 4742 st1 { q4}, [r2], #16 @ AES block 4k+4 - store result 4743 4744 orr r9, r11, r9, lsl #32 @ CTR block 4k+10 4745 eor v11.16b, v11.16b, q9 @ MODULO - fold 
into low 4746 fmov v7.d[1], r24 @ AES block 4k+7 - mov high 4747 4748 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 4749 st1 { q5}, [r2], #16 @ AES block 4k+5 - store result 4750 add r12, r12, #1 @ CTR block 4k+10 4751 4752 aese q3, v31.16b @ AES block 4k+7 - round 13 4753 eor q6, q6, q2 @ AES block 4k+6 - result 4754 fmov d2, r10 @ CTR block 4k+10 4755 4756 st1 { q6}, [r2], #16 @ AES block 4k+6 - store result 4757 fmov v2.d[1], r9 @ CTR block 4k+10 4758 rev r9, r12 @ CTR block 4k+11 4759 4760 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 4761 orr r9, r11, r9, lsl #32 @ CTR block 4k+11 4762 4763 eor q7, q7, q3 @ AES block 4k+7 - result 4764 st1 { q7}, [r2], #16 @ AES block 4k+7 - store result 4765 blt .L256_enc_main_loop 4766 4767 .L256_enc_prepretail:@ PREPRETAIL 4768 aese q1, v18.16b 4769 aesmc q1, q1 @ AES block 4k+5 - round 0 4770 rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) 4771 4772 aese q2, v18.16b 4773 aesmc q2, q2 @ AES block 4k+6 - round 0 4774 fmov d3, r10 @ CTR block 4k+3 4775 4776 aese q0, v18.16b 4777 aesmc q0, q0 @ AES block 4k+4 - round 0 4778 rev64 q4, q4 @ GHASH block 4k (only t0 is free) 4779 4780 fmov v3.d[1], r9 @ CTR block 4k+3 4781 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 4782 4783 aese q2, v19.16b 4784 aesmc q2, q2 @ AES block 4k+6 - round 1 4785 4786 aese q0, v19.16b 4787 aesmc q0, q0 @ AES block 4k+4 - round 1 4788 4789 eor q4, q4, v11.16b @ PRE 1 4790 rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free) 4791 4792 aese q2, v20.16b 4793 aesmc q2, q2 @ AES block 4k+6 - round 2 4794 4795 aese q3, v18.16b 4796 aesmc q3, q3 @ AES block 4k+7 - round 0 4797 mov d10, v17.d[1] @ GHASH block 4k - mid 4798 4799 aese q1, v19.16b 4800 aesmc q1, q1 @ AES block 4k+5 - round 1 4801 4802 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 4803 mov d8, v4.d[1] @ GHASH block 4k - mid 4804 4805 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 4806 4807 aese q2, v21.16b 4808 aesmc q2, q2 @ AES block 4k+6 - round 3 4809 4810 aese 
q1, v20.16b 4811 aesmc q1, q1 @ AES block 4k+5 - round 2 4812 eor q8, q8, q4 @ GHASH block 4k - mid 4813 4814 aese q0, v20.16b 4815 aesmc q0, q0 @ AES block 4k+4 - round 2 4816 4817 aese q3, v19.16b 4818 aesmc q3, q3 @ AES block 4k+7 - round 1 4819 4820 aese q1, v21.16b 4821 aesmc q1, q1 @ AES block 4k+5 - round 3 4822 4823 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 4824 4825 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 4826 4827 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 4828 4829 aese q3, v20.16b 4830 aesmc q3, q3 @ AES block 4k+7 - round 2 4831 4832 eor q9, q9, q4 @ GHASH block 4k+1 - high 4833 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 4834 4835 aese q0, v21.16b 4836 aesmc q0, q0 @ AES block 4k+4 - round 3 4837 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 4838 4839 aese q3, v21.16b 4840 aesmc q3, q3 @ AES block 4k+7 - round 3 4841 4842 eor q4, q4, q5 @ GHASH block 4k+1 - mid 4843 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 4844 4845 aese q0, v22.16b 4846 aesmc q0, q0 @ AES block 4k+4 - round 4 4847 rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4848 4849 aese q3, v22.16b 4850 aesmc q3, q3 @ AES block 4k+7 - round 4 4851 4852 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 4853 eor q8, q8, q6 @ GHASH block 4k+2 - mid 4854 add r12, r12, #1 @ CTR block 4k+3 4855 4856 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 4857 4858 aese q3, v23.16b 4859 aesmc q3, q3 @ AES block 4k+7 - round 5 4860 4861 aese q2, v22.16b 4862 aesmc q2, q2 @ AES block 4k+6 - round 4 4863 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 4864 4865 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 4866 4867 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 4868 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 4869 4870 aese q2, v23.16b 4871 aesmc q2, q2 @ AES block 4k+6 - round 5 4872 4873 eor q9, q9, q4 @ GHASH block 4k+2 - high 4874 mov d4, v7.d[1] @ GHASH block 4k+3 - mid 4875 4876 aese q1, v22.16b 4877 aesmc q1, q1 @ AES block 4k+5 - round 4 4878 4879 pmull2 
v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 4880 4881 eor q4, q4, q7 @ GHASH block 4k+3 - mid 4882 4883 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 4884 4885 aese q1, v23.16b 4886 aesmc q1, q1 @ AES block 4k+5 - round 5 4887 4888 pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid 4889 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 4890 4891 aese q0, v23.16b 4892 aesmc q0, q0 @ AES block 4k+4 - round 5 4893 4894 aese q1, v24.16b 4895 aesmc q1, q1 @ AES block 4k+5 - round 6 4896 4897 aese q2, v24.16b 4898 aesmc q2, q2 @ AES block 4k+6 - round 6 4899 4900 aese q0, v24.16b 4901 aesmc q0, q0 @ AES block 4k+4 - round 6 4902 movi q8, #0xc2 4903 4904 aese q3, v24.16b 4905 aesmc q3, q3 @ AES block 4k+7 - round 6 4906 4907 aese q1, v25.16b 4908 aesmc q1, q1 @ AES block 4k+5 - round 7 4909 eor q9, q9, q5 @ GHASH block 4k+3 - high 4910 4911 aese q0, v25.16b 4912 aesmc q0, q0 @ AES block 4k+4 - round 7 4913 4914 aese q3, v25.16b 4915 aesmc q3, q3 @ AES block 4k+7 - round 7 4916 shl d8, d8, #56 @ mod_constant 4917 4918 aese q1, v26.16b 4919 aesmc q1, q1 @ AES block 4k+5 - round 8 4920 eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid 4921 4922 pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low 4923 4924 aese q3, v26.16b 4925 aesmc q3, q3 @ AES block 4k+7 - round 8 4926 4927 aese q1, v27.16b 4928 aesmc q1, q1 @ AES block 4k+5 - round 9 4929 4930 aese q0, v26.16b 4931 aesmc q0, q0 @ AES block 4k+4 - round 8 4932 eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low 4933 4934 aese q3, v27.16b 4935 aesmc q3, q3 @ AES block 4k+7 - round 9 4936 4937 eor v10.16b, v10.16b, q9 @ karatsuba tidy up 4938 4939 pmull v4.1q, q9, q8 4940 ext q9, q9, q9, #8 4941 4942 aese q3, v28.16b 4943 aesmc q3, q3 @ AES block 4k+7 - round 10 4944 4945 aese q2, v25.16b 4946 aesmc q2, q2 @ AES block 4k+6 - round 7 4947 eor v10.16b, v10.16b, v11.16b 4948 4949 aese q1, v28.16b 4950 aesmc q1, q1 @ AES block 4k+5 - round 10 4951 4952 aese q0, v27.16b 4953 aesmc q0, q0 @ AES block 4k+4 - round 9 4954 4955 aese q2, 
v26.16b 4956 aesmc q2, q2 @ AES block 4k+6 - round 8 4957 4958 aese q1, v29.16b 4959 aesmc q1, q1 @ AES block 4k+5 - round 11 4960 eor v10.16b, v10.16b, q4 4961 4962 aese q0, v28.16b 4963 aesmc q0, q0 @ AES block 4k+4 - round 10 4964 4965 aese q2, v27.16b 4966 aesmc q2, q2 @ AES block 4k+6 - round 9 4967 4968 aese q1, v30.16b 4969 aesmc q1, q1 @ AES block 4k+5 - round 12 4970 4971 aese q0, v29.16b 4972 aesmc q0, q0 @ AES block 4k+4 - round 11 4973 eor v10.16b, v10.16b, q9 4974 4975 aese q3, v29.16b 4976 aesmc q3, q3 @ AES block 4k+7 - round 11 4977 4978 aese q2, v28.16b 4979 aesmc q2, q2 @ AES block 4k+6 - round 10 4980 4981 aese q0, v30.16b 4982 aesmc q0, q0 @ AES block 4k+4 - round 12 4983 4984 pmull v4.1q, v10.1d, q8 4985 4986 aese q2, v29.16b 4987 aesmc q2, q2 @ AES block 4k+6 - round 11 4988 ext v10.16b, v10.16b, v10.16b, #8 4989 4990 aese q3, v30.16b 4991 aesmc q3, q3 @ AES block 4k+7 - round 12 4992 4993 aese q1, v31.16b @ AES block 4k+5 - round 13 4994 eor v11.16b, v11.16b, q4 4995 4996 aese q2, v30.16b 4997 aesmc q2, q2 @ AES block 4k+6 - round 12 4998 4999 aese q3, v31.16b @ AES block 4k+7 - round 13 5000 5001 aese q0, v31.16b @ AES block 4k+4 - round 13 5002 5003 aese q2, v31.16b @ AES block 4k+6 - round 13 5004 eor v11.16b, v11.16b, v10.16b 5005 .L256_enc_tail:@ TAIL 5006 5007 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 5008 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 5009 ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext 5010 #ifdef __ARMEB__ 5011 rev r6, r6 5012 rev r7, r7 5013 #endif 5014 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 5015 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 5016 5017 cmp r5, #48 5018 fmov d4, r6 @ AES block 4k+4 - mov low 5019 5020 fmov v4.d[1], r7 @ AES block 4k+4 - mov high 5021 5022 eor q5, q4, q0 @ AES block 4k+4 - result 5023 bgt .L256_enc_blocks_more_than_3 5024 5025 cmp r5, #32 5026 mov q3, q2 5027 movi v11.8b, #0 5028 5029 movi q9, #0 5030 sub r12, r12, #1 5031 5032 
mov q2, q1 5033 movi v10.8b, #0 5034 bgt .L256_enc_blocks_more_than_2 5035 5036 mov q3, q1 5037 sub r12, r12, #1 5038 cmp r5, #16 5039 5040 bgt .L256_enc_blocks_more_than_1 5041 5042 sub r12, r12, #1 5043 b .L256_enc_blocks_less_than_1 5044 .L256_enc_blocks_more_than_3:@ blocks left > 3 5045 st1 { q5}, [r2], #16 @ AES final-3 block - store result 5046 5047 ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high 5048 #ifdef __ARMEB__ 5049 rev r6, r6 5050 rev r7, r7 5051 #endif 5052 rev64 q4, q5 @ GHASH final-3 block 5053 5054 eor r6, r6, r13 @ AES final-2 block - round 14 low 5055 eor q4, q4, q8 @ feed in partial tag 5056 5057 eor r7, r7, r14 @ AES final-2 block - round 14 high 5058 5059 mov d22, v4.d[1] @ GHASH final-3 block - mid 5060 fmov d5, r6 @ AES final-2 block - mov low 5061 5062 fmov v5.d[1], r7 @ AES final-2 block - mov high 5063 5064 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 5065 movi q8, #0 @ suppress further partial tag feed in 5066 5067 mov d10, v17.d[1] @ GHASH final-3 block - mid 5068 5069 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 5070 5071 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 5072 5073 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 5074 eor q5, q5, q1 @ AES final-2 block - result 5075 .L256_enc_blocks_more_than_2:@ blocks left > 2 5076 5077 st1 { q5}, [r2], #16 @ AES final-2 block - store result 5078 5079 ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high 5080 #ifdef __ARMEB__ 5081 rev r6, r6 5082 rev r7, r7 5083 #endif 5084 rev64 q4, q5 @ GHASH final-2 block 5085 5086 eor r6, r6, r13 @ AES final-1 block - round 14 low 5087 eor q4, q4, q8 @ feed in partial tag 5088 5089 fmov d5, r6 @ AES final-1 block - mov low 5090 eor r7, r7, r14 @ AES final-1 block - round 14 high 5091 5092 fmov v5.d[1], r7 @ AES final-1 block - mov high 5093 5094 movi q8, #0 @ suppress further partial tag feed in 5095 5096 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 5097 mov d22, v4.d[1] @ GHASH 
final-2 block - mid 5098 5099 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 5100 5101 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 5102 5103 eor q5, q5, q2 @ AES final-1 block - result 5104 5105 eor q9, q9, v20.16b @ GHASH final-2 block - high 5106 5107 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 5108 5109 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 5110 5111 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 5112 .L256_enc_blocks_more_than_1:@ blocks left > 1 5113 5114 st1 { q5}, [r2], #16 @ AES final-1 block - store result 5115 5116 rev64 q4, q5 @ GHASH final-1 block 5117 5118 ldp r6, r7, [r0], #16 @ AES final block - load input low & high 5119 #ifdef __ARMEB__ 5120 rev r6, r6 5121 rev r7, r7 5122 #endif 5123 eor q4, q4, q8 @ feed in partial tag 5124 5125 movi q8, #0 @ suppress further partial tag feed in 5126 5127 eor r6, r6, r13 @ AES final block - round 14 low 5128 mov d22, v4.d[1] @ GHASH final-1 block - mid 5129 5130 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 5131 eor r7, r7, r14 @ AES final block - round 14 high 5132 5133 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 5134 5135 eor q9, q9, v20.16b @ GHASH final-1 block - high 5136 5137 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 5138 fmov d5, r6 @ AES final block - mov low 5139 5140 fmov v5.d[1], r7 @ AES final block - mov high 5141 5142 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 5143 5144 pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low 5145 5146 eor q5, q5, q3 @ AES final block - result 5147 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 5148 5149 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 5150 .L256_enc_blocks_less_than_1:@ blocks left <= 1 5151 5152 and r1, r1, #127 @ bit_length %= 128 5153 5154 mvn r13, xzr @ rk14_l = 0xffffffffffffffff 5155 sub r1, r1, #128 @ bit_length -= 128 5156 5157 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 5158 ld1 { v18.16b}, [r2] @ load 
existing bytes where the possibly partial last block is to be stored 5159 5160 mvn r14, xzr @ rk14_h = 0xffffffffffffffff 5161 and r1, r1, #127 @ bit_length %= 128 5162 5163 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block 5164 cmp r1, #64 5165 5166 csel r6, r13, r14, lt 5167 csel r7, r14, xzr, lt 5168 5169 fmov d0, r6 @ ctr0b is mask for last block 5170 5171 fmov v0.d[1], r7 5172 5173 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 5174 5175 rev64 q4, q5 @ GHASH final block 5176 5177 eor q4, q4, q8 @ feed in partial tag 5178 5179 bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing 5180 5181 pmull2 v20.1q, q4, v12.2d @ GHASH final block - high 5182 mov d8, v4.d[1] @ GHASH final block - mid 5183 #ifndef __ARMEB__ 5184 rev r9, r12 5185 #else 5186 mov r9, r12 5187 #endif 5188 5189 pmull v21.1q, q4, v12.1d @ GHASH final block - low 5190 5191 eor q9, q9, v20.16b @ GHASH final block - high 5192 eor q8, q8, q4 @ GHASH final block - mid 5193 5194 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 5195 5196 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 5197 5198 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 5199 movi q8, #0xc2 5200 5201 eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up 5202 5203 shl d8, d8, #56 @ mod_constant 5204 5205 eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up 5206 5207 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 5208 5209 ext q9, q9, q9, #8 @ MODULO - other top alignment 5210 5211 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 5212 5213 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 5214 5215 pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low 5216 5217 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 5218 5219 str r9, [r16, #12] @ store the updated counter 5220 5221 st1 { q5}, [r2] @ store all 16B 5222 eor v11.16b, v11.16b, q9 @ MODULO - fold into low 5223 5224 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 5225 ext 
v11.16b, v11.16b, v11.16b, #8 5226 rev64 v11.16b, v11.16b 5227 mov r0, r15 5228 st1 { v11.16b }, [r3] 5229 5230 ldp r21, r22, [sp, #16] 5231 ldp r23, r24, [sp, #32] 5232 ldp d8, d9, [sp, #48] 5233 ldp d10, d11, [sp, #64] 5234 ldp d12, d13, [sp, #80] 5235 ldp d14, d15, [sp, #96] 5236 ldp r19, r20, [sp], #112 5237 RET 5238 5239 .L256_enc_ret: 5240 mov r0, #0x0 5241 RET 5242 .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5243 .globl aes_gcm_dec_256_kernel 5244 .type aes_gcm_dec_256_kernel,%function 5245 .align 4 5246 aes_gcm_dec_256_kernel: 5247 AARCH64_VALID_CALL_TARGET 5248 cbz r1, .L256_dec_ret 5249 stp r19, r20, [sp, #-112]! 5250 mov r16, r4 5251 mov r8, r5 5252 stp r21, r22, [sp, #16] 5253 stp r23, r24, [sp, #32] 5254 stp d8, d9, [sp, #48] 5255 stp d10, d11, [sp, #64] 5256 stp d12, d13, [sp, #80] 5257 stp d14, d15, [sp, #96] 5258 5259 lsr r5, r1, #3 @ byte_len 5260 mov r15, r5 5261 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 5262 #ifdef __ARMEB__ 5263 rev r10, r10 5264 rev r11, r11 5265 #endif 5266 ldp r13, r14, [r8, #224] @ load rk14 5267 #ifdef __ARMEB__ 5268 ror r14, r14, #32 5269 ror r13, r13, #32 5270 #endif 5271 ld1 {v18.4s}, [r8], #16 @ load rk0 5272 sub r5, r5, #1 @ byte_len - 1 5273 5274 ld1 {v19.4s}, [r8], #16 @ load rk1 5275 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5276 5277 add r4, r0, r1, lsr #3 @ end_input_ptr 5278 ld1 {v20.4s}, [r8], #16 @ load rk2 5279 5280 lsr r12, r11, #32 5281 ld1 {v21.4s}, [r8], #16 @ load rk3 5282 orr r11, r11, r11 5283 5284 ld1 {v22.4s}, [r8], #16 @ load rk4 5285 add r5, r5, r0 5286 rev r12, r12 @ rev_ctr32 5287 5288 add r12, r12, #1 @ increment rev_ctr32 5289 fmov d3, r10 @ CTR block 3 5290 5291 rev r9, r12 @ CTR block 1 5292 add r12, r12, #1 @ CTR block 1 5293 fmov d1, r10 @ CTR block 1 5294 5295 orr r9, r11, r9, lsl #32 @ CTR block 1 5296 ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as 
quickly as possible 5297 5298 fmov v1.d[1], r9 @ CTR block 1 5299 rev r9, r12 @ CTR block 2 5300 add r12, r12, #1 @ CTR block 2 5301 5302 fmov d2, r10 @ CTR block 2 5303 orr r9, r11, r9, lsl #32 @ CTR block 2 5304 5305 fmov v2.d[1], r9 @ CTR block 2 5306 rev r9, r12 @ CTR block 3 5307 5308 orr r9, r11, r9, lsl #32 @ CTR block 3 5309 ld1 {v23.4s}, [r8], #16 @ load rk5 5310 5311 fmov v3.d[1], r9 @ CTR block 3 5312 add r12, r12, #1 @ CTR block 3 5313 5314 ld1 {v24.4s}, [r8], #16 @ load rk6 5315 5316 ld1 {v25.4s}, [r8], #16 @ load rk7 5317 5318 ld1 {v26.4s}, [r8], #16 @ load rk8 5319 5320 aese q0, v18.16b 5321 aesmc q0, q0 @ AES block 0 - round 0 5322 ldr q14, [r3, #80] @ load h3l | h3h 5323 #ifndef __ARMEB__ 5324 ext v14.16b, v14.16b, v14.16b, #8 5325 #endif 5326 5327 aese q3, v18.16b 5328 aesmc q3, q3 @ AES block 3 - round 0 5329 ldr q15, [r3, #112] @ load h4l | h4h 5330 #ifndef __ARMEB__ 5331 ext v15.16b, v15.16b, v15.16b, #8 5332 #endif 5333 5334 aese q1, v18.16b 5335 aesmc q1, q1 @ AES block 1 - round 0 5336 ldr q13, [r3, #64] @ load h2l | h2h 5337 #ifndef __ARMEB__ 5338 ext v13.16b, v13.16b, v13.16b, #8 5339 #endif 5340 5341 aese q2, v18.16b 5342 aesmc q2, q2 @ AES block 2 - round 0 5343 ld1 {v27.4s}, [r8], #16 @ load rk9 5344 5345 aese q0, v19.16b 5346 aesmc q0, q0 @ AES block 0 - round 1 5347 5348 aese q1, v19.16b 5349 aesmc q1, q1 @ AES block 1 - round 1 5350 ld1 { v11.16b}, [r3] 5351 ext v11.16b, v11.16b, v11.16b, #8 5352 rev64 v11.16b, v11.16b 5353 5354 aese q2, v19.16b 5355 aesmc q2, q2 @ AES block 2 - round 1 5356 ld1 {v28.4s}, [r8], #16 @ load rk10 5357 5358 aese q3, v19.16b 5359 aesmc q3, q3 @ AES block 3 - round 1 5360 ld1 {v29.4s}, [r8], #16 @ load rk11 5361 5362 aese q0, v20.16b 5363 aesmc q0, q0 @ AES block 0 - round 2 5364 ldr q12, [r3, #32] @ load h1l | h1h 5365 #ifndef __ARMEB__ 5366 ext v12.16b, v12.16b, v12.16b, #8 5367 #endif 5368 aese q2, v20.16b 5369 aesmc q2, q2 @ AES block 2 - round 2 5370 ld1 {v30.4s}, [r8], #16 @ load rk12 5371 5372 aese 
q3, v20.16b 5373 aesmc q3, q3 @ AES block 3 - round 2 5374 5375 aese q0, v21.16b 5376 aesmc q0, q0 @ AES block 0 - round 3 5377 5378 aese q1, v20.16b 5379 aesmc q1, q1 @ AES block 1 - round 2 5380 5381 aese q3, v21.16b 5382 aesmc q3, q3 @ AES block 3 - round 3 5383 5384 aese q0, v22.16b 5385 aesmc q0, q0 @ AES block 0 - round 4 5386 cmp r0, r5 @ check if we have <= 4 blocks 5387 5388 aese q2, v21.16b 5389 aesmc q2, q2 @ AES block 2 - round 3 5390 5391 aese q1, v21.16b 5392 aesmc q1, q1 @ AES block 1 - round 3 5393 5394 aese q3, v22.16b 5395 aesmc q3, q3 @ AES block 3 - round 4 5396 5397 aese q2, v22.16b 5398 aesmc q2, q2 @ AES block 2 - round 4 5399 5400 aese q1, v22.16b 5401 aesmc q1, q1 @ AES block 1 - round 4 5402 5403 aese q3, v23.16b 5404 aesmc q3, q3 @ AES block 3 - round 5 5405 5406 aese q0, v23.16b 5407 aesmc q0, q0 @ AES block 0 - round 5 5408 5409 aese q1, v23.16b 5410 aesmc q1, q1 @ AES block 1 - round 5 5411 5412 aese q2, v23.16b 5413 aesmc q2, q2 @ AES block 2 - round 5 5414 5415 aese q0, v24.16b 5416 aesmc q0, q0 @ AES block 0 - round 6 5417 5418 aese q3, v24.16b 5419 aesmc q3, q3 @ AES block 3 - round 6 5420 5421 aese q1, v24.16b 5422 aesmc q1, q1 @ AES block 1 - round 6 5423 5424 aese q2, v24.16b 5425 aesmc q2, q2 @ AES block 2 - round 6 5426 5427 aese q0, v25.16b 5428 aesmc q0, q0 @ AES block 0 - round 7 5429 5430 aese q1, v25.16b 5431 aesmc q1, q1 @ AES block 1 - round 7 5432 5433 aese q3, v25.16b 5434 aesmc q3, q3 @ AES block 3 - round 7 5435 5436 aese q0, v26.16b 5437 aesmc q0, q0 @ AES block 0 - round 8 5438 5439 aese q2, v25.16b 5440 aesmc q2, q2 @ AES block 2 - round 7 5441 5442 aese q3, v26.16b 5443 aesmc q3, q3 @ AES block 3 - round 8 5444 5445 aese q1, v26.16b 5446 aesmc q1, q1 @ AES block 1 - round 8 5447 5448 aese q0, v27.16b 5449 aesmc q0, q0 @ AES block 0 - round 9 5450 5451 aese q2, v26.16b 5452 aesmc q2, q2 @ AES block 2 - round 8 5453 ld1 {v31.4s}, [r8], #16 @ load rk13 5454 5455 aese q1, v27.16b 5456 aesmc q1, q1 @ AES block 1 - 
round 9 5457 5458 aese q0, v28.16b 5459 aesmc q0, q0 @ AES block 0 - round 10 5460 5461 aese q3, v27.16b 5462 aesmc q3, q3 @ AES block 3 - round 9 5463 5464 aese q1, v28.16b 5465 aesmc q1, q1 @ AES block 1 - round 10 5466 5467 aese q2, v27.16b 5468 aesmc q2, q2 @ AES block 2 - round 9 5469 5470 aese q3, v28.16b 5471 aesmc q3, q3 @ AES block 3 - round 10 5472 5473 aese q0, v29.16b 5474 aesmc q0, q0 @ AES block 0 - round 11 5475 5476 aese q2, v28.16b 5477 aesmc q2, q2 @ AES block 2 - round 10 5478 5479 aese q3, v29.16b 5480 aesmc q3, q3 @ AES block 3 - round 11 5481 5482 aese q1, v29.16b 5483 aesmc q1, q1 @ AES block 1 - round 11 5484 5485 aese q2, v29.16b 5486 aesmc q2, q2 @ AES block 2 - round 11 5487 5488 trn1 q9, v14.2d, v15.2d @ h4h | h3h 5489 5490 trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l 5491 5492 trn1 q8, v12.2d, v13.2d @ h2h | h1h 5493 trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l 5494 5495 aese q1, v30.16b 5496 aesmc q1, q1 @ AES block 1 - round 12 5497 5498 aese q0, v30.16b 5499 aesmc q0, q0 @ AES block 0 - round 12 5500 5501 aese q2, v30.16b 5502 aesmc q2, q2 @ AES block 2 - round 12 5503 5504 aese q3, v30.16b 5505 aesmc q3, q3 @ AES block 3 - round 12 5506 eor v17.16b, v17.16b, q9 @ h4k | h3k 5507 5508 aese q1, v31.16b @ AES block 1 - round 13 5509 5510 aese q2, v31.16b @ AES block 2 - round 13 5511 eor v16.16b, v16.16b, q8 @ h2k | h1k 5512 5513 aese q3, v31.16b @ AES block 3 - round 13 5514 5515 aese q0, v31.16b @ AES block 0 - round 13 5516 bge .L256_dec_tail @ handle tail 5517 5518 ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext 5519 5520 rev r9, r12 @ CTR block 4 5521 5522 eor q0, q4, q0 @ AES block 0 - result 5523 5524 eor q1, q5, q1 @ AES block 1 - result 5525 rev64 q5, q5 @ GHASH block 1 5526 ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext 5527 5528 mov r7, v0.d[1] @ AES block 0 - mov high 5529 5530 mov r6, v0.d[0] @ AES block 0 - mov low 5531 rev64 q4, q4 @ GHASH block 0 5532 add r12, r12, #1 @ CTR block 4 5533 5534 fmov d0, r10 @ CTR block 
4 5535 orr r9, r11, r9, lsl #32 @ CTR block 4 5536 5537 fmov v0.d[1], r9 @ CTR block 4 5538 rev r9, r12 @ CTR block 5 5539 add r12, r12, #1 @ CTR block 5 5540 5541 mov r19, v1.d[0] @ AES block 1 - mov low 5542 5543 orr r9, r11, r9, lsl #32 @ CTR block 5 5544 mov r20, v1.d[1] @ AES block 1 - mov high 5545 eor r7, r7, r14 @ AES block 0 - round 14 high 5546 #ifdef __ARMEB__ 5547 rev r7, r7 5548 #endif 5549 eor r6, r6, r13 @ AES block 0 - round 14 low 5550 #ifdef __ARMEB__ 5551 rev r6, r6 5552 #endif 5553 stp r6, r7, [r2], #16 @ AES block 0 - store result 5554 fmov d1, r10 @ CTR block 5 5555 5556 ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext 5557 5558 fmov v1.d[1], r9 @ CTR block 5 5559 rev r9, r12 @ CTR block 6 5560 add r12, r12, #1 @ CTR block 6 5561 5562 eor r19, r19, r13 @ AES block 1 - round 14 low 5563 #ifdef __ARMEB__ 5564 rev r19, r19 5565 #endif 5566 orr r9, r11, r9, lsl #32 @ CTR block 6 5567 5568 eor r20, r20, r14 @ AES block 1 - round 14 high 5569 #ifdef __ARMEB__ 5570 rev r20, r20 5571 #endif 5572 stp r19, r20, [r2], #16 @ AES block 1 - store result 5573 5574 eor q2, q6, q2 @ AES block 2 - result 5575 cmp r0, r5 @ check if we have <= 8 blocks 5576 bge .L256_dec_prepretail @ do prepretail 5577 5578 .L256_dec_main_loop:@ main loop start 5579 mov r21, v2.d[0] @ AES block 4k+2 - mov low 5580 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 5581 eor q3, q7, q3 @ AES block 4k+3 - result 5582 5583 aese q0, v18.16b 5584 aesmc q0, q0 @ AES block 4k+4 - round 0 5585 mov r22, v2.d[1] @ AES block 4k+2 - mov high 5586 5587 aese q1, v18.16b 5588 aesmc q1, q1 @ AES block 4k+5 - round 0 5589 fmov d2, r10 @ CTR block 4k+6 5590 5591 fmov v2.d[1], r9 @ CTR block 4k+6 5592 eor q4, q4, v11.16b @ PRE 1 5593 rev r9, r12 @ CTR block 4k+7 5594 5595 aese q0, v19.16b 5596 aesmc q0, q0 @ AES block 4k+4 - round 1 5597 mov r24, v3.d[1] @ AES block 4k+3 - mov high 5598 5599 aese q1, v19.16b 5600 aesmc q1, q1 @ AES block 4k+5 - round 1 5601 mov r23, v3.d[0] @ AES block 4k+3 - mov low 5602 
5603 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 5604 mov d8, v4.d[1] @ GHASH block 4k - mid 5605 fmov d3, r10 @ CTR block 4k+7 5606 5607 aese q0, v20.16b 5608 aesmc q0, q0 @ AES block 4k+4 - round 2 5609 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 5610 5611 aese q2, v18.16b 5612 aesmc q2, q2 @ AES block 4k+6 - round 0 5613 fmov v3.d[1], r9 @ CTR block 4k+7 5614 5615 aese q1, v20.16b 5616 aesmc q1, q1 @ AES block 4k+5 - round 2 5617 eor q8, q8, q4 @ GHASH block 4k - mid 5618 5619 aese q0, v21.16b 5620 aesmc q0, q0 @ AES block 4k+4 - round 3 5621 eor r22, r22, r14 @ AES block 4k+2 - round 14 high 5622 #ifdef __ARMEB__ 5623 rev r22, r22 5624 #endif 5625 aese q2, v19.16b 5626 aesmc q2, q2 @ AES block 4k+6 - round 1 5627 mov d10, v17.d[1] @ GHASH block 4k - mid 5628 5629 aese q1, v21.16b 5630 aesmc q1, q1 @ AES block 4k+5 - round 3 5631 rev64 q6, q6 @ GHASH block 4k+2 5632 5633 aese q3, v18.16b 5634 aesmc q3, q3 @ AES block 4k+7 - round 0 5635 eor r21, r21, r13 @ AES block 4k+2 - round 14 low 5636 #ifdef __ARMEB__ 5637 rev r21, r21 5638 #endif 5639 aese q2, v20.16b 5640 aesmc q2, q2 @ AES block 4k+6 - round 2 5641 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 5642 5643 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 5644 5645 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 5646 5647 aese q2, v21.16b 5648 aesmc q2, q2 @ AES block 4k+6 - round 3 5649 rev64 q7, q7 @ GHASH block 4k+3 5650 5651 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 5652 eor r23, r23, r13 @ AES block 4k+3 - round 14 low 5653 #ifdef __ARMEB__ 5654 rev r23, r23 5655 #endif 5656 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 5657 eor r24, r24, r14 @ AES block 4k+3 - round 14 high 5658 #ifdef __ARMEB__ 5659 rev r24, r24 5660 #endif 5661 eor q9, q9, q4 @ GHASH block 4k+1 - high 5662 5663 aese q2, v22.16b 5664 aesmc q2, q2 @ AES block 4k+6 - round 4 5665 5666 aese q3, v19.16b 5667 aesmc q3, q3 @ AES block 4k+7 - round 1 5668 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 5669 5670 aese q0, 
v22.16b 5671 aesmc q0, q0 @ AES block 4k+4 - round 4 5672 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 5673 5674 aese q2, v23.16b 5675 aesmc q2, q2 @ AES block 4k+6 - round 5 5676 add r12, r12, #1 @ CTR block 4k+7 5677 5678 aese q3, v20.16b 5679 aesmc q3, q3 @ AES block 4k+7 - round 2 5680 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 5681 5682 aese q1, v22.16b 5683 aesmc q1, q1 @ AES block 4k+5 - round 4 5684 eor q4, q4, q5 @ GHASH block 4k+1 - mid 5685 5686 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 5687 5688 aese q3, v21.16b 5689 aesmc q3, q3 @ AES block 4k+7 - round 3 5690 eor q8, q8, q6 @ GHASH block 4k+2 - mid 5691 5692 aese q1, v23.16b 5693 aesmc q1, q1 @ AES block 4k+5 - round 5 5694 5695 aese q0, v23.16b 5696 aesmc q0, q0 @ AES block 4k+4 - round 5 5697 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 5698 5699 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 5700 rev r9, r12 @ CTR block 4k+8 5701 5702 aese q1, v24.16b 5703 aesmc q1, q1 @ AES block 4k+5 - round 6 5704 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 5705 5706 aese q0, v24.16b 5707 aesmc q0, q0 @ AES block 4k+4 - round 6 5708 add r12, r12, #1 @ CTR block 4k+8 5709 5710 aese q3, v22.16b 5711 aesmc q3, q3 @ AES block 4k+7 - round 4 5712 5713 aese q1, v25.16b 5714 aesmc q1, q1 @ AES block 4k+5 - round 7 5715 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 5716 5717 aese q0, v25.16b 5718 aesmc q0, q0 @ AES block 4k+4 - round 7 5719 5720 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 5721 mov d6, v7.d[1] @ GHASH block 4k+3 - mid 5722 5723 aese q3, v23.16b 5724 aesmc q3, q3 @ AES block 4k+7 - round 5 5725 5726 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 5727 5728 aese q0, v26.16b 5729 aesmc q0, q0 @ AES block 4k+4 - round 8 5730 eor q9, q9, q4 @ GHASH block 4k+2 - high 5731 5732 aese q3, v24.16b 5733 aesmc q3, q3 @ AES block 4k+7 - round 6 5734 5735 pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low 5736 orr r9, r11, r9, lsl #32 @ CTR block 4k+8 5737 eor v10.16b, v10.16b, q8 @ GHASH 
block 4k+2 - mid 5738 5739 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 5740 5741 aese q0, v27.16b 5742 aesmc q0, q0 @ AES block 4k+4 - round 9 5743 eor q6, q6, q7 @ GHASH block 4k+3 - mid 5744 5745 aese q1, v26.16b 5746 aesmc q1, q1 @ AES block 4k+5 - round 8 5747 5748 aese q2, v24.16b 5749 aesmc q2, q2 @ AES block 4k+6 - round 6 5750 eor q9, q9, q5 @ GHASH block 4k+3 - high 5751 5752 aese q0, v28.16b 5753 aesmc q0, q0 @ AES block 4k+4 - round 10 5754 5755 pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid 5756 movi q8, #0xc2 5757 5758 aese q2, v25.16b 5759 aesmc q2, q2 @ AES block 4k+6 - round 7 5760 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low 5761 5762 aese q0, v29.16b 5763 aesmc q0, q0 @ AES block 4k+4 - round 11 5764 5765 aese q3, v25.16b 5766 aesmc q3, q3 @ AES block 4k+7 - round 7 5767 shl d8, d8, #56 @ mod_constant 5768 5769 aese q2, v26.16b 5770 aesmc q2, q2 @ AES block 4k+6 - round 8 5771 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid 5772 5773 aese q0, v30.16b 5774 aesmc q0, q0 @ AES block 4k+4 - round 12 5775 5776 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 5777 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 5778 5779 aese q1, v27.16b 5780 aesmc q1, q1 @ AES block 4k+5 - round 9 5781 ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext 5782 5783 aese q0, v31.16b @ AES block 4k+4 - round 13 5784 ext q9, q9, q9, #8 @ MODULO - other top alignment 5785 5786 aese q1, v28.16b 5787 aesmc q1, q1 @ AES block 4k+5 - round 10 5788 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 5789 5790 aese q2, v27.16b 5791 aesmc q2, q2 @ AES block 4k+6 - round 9 5792 ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext 5793 5794 aese q3, v26.16b 5795 aesmc q3, q3 @ AES block 4k+7 - round 8 5796 eor q0, q4, q0 @ AES block 4k+4 - result 5797 5798 aese q1, v29.16b 5799 aesmc q1, q1 @ AES block 4k+5 - round 11 5800 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 5801 5802 aese q2, v28.16b 5803 aesmc q2, q2 @ AES block 4k+6 - round 10 5804 
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 5805 5806 aese q3, v27.16b 5807 aesmc q3, q3 @ AES block 4k+7 - round 9 5808 ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext 5809 5810 aese q1, v30.16b 5811 aesmc q1, q1 @ AES block 4k+5 - round 12 5812 ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext 5813 5814 aese q2, v29.16b 5815 aesmc q2, q2 @ AES block 4k+6 - round 11 5816 mov r7, v0.d[1] @ AES block 4k+4 - mov high 5817 5818 aese q3, v28.16b 5819 aesmc q3, q3 @ AES block 4k+7 - round 10 5820 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 5821 5822 aese q1, v31.16b @ AES block 4k+5 - round 13 5823 mov r6, v0.d[0] @ AES block 4k+4 - mov low 5824 5825 aese q2, v30.16b 5826 aesmc q2, q2 @ AES block 4k+6 - round 12 5827 fmov d0, r10 @ CTR block 4k+8 5828 5829 aese q3, v29.16b 5830 aesmc q3, q3 @ AES block 4k+7 - round 11 5831 fmov v0.d[1], r9 @ CTR block 4k+8 5832 5833 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 5834 eor q1, q5, q1 @ AES block 4k+5 - result 5835 rev r9, r12 @ CTR block 4k+9 5836 5837 aese q2, v31.16b @ AES block 4k+6 - round 13 5838 orr r9, r11, r9, lsl #32 @ CTR block 4k+9 5839 cmp r0, r5 @ .LOOP CONTROL 5840 5841 add r12, r12, #1 @ CTR block 4k+9 5842 5843 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 5844 #ifdef __ARMEB__ 5845 rev r6, r6 5846 #endif 5847 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 5848 #ifdef __ARMEB__ 5849 rev r7, r7 5850 #endif 5851 mov r20, v1.d[1] @ AES block 4k+5 - mov high 5852 eor q2, q6, q2 @ AES block 4k+6 - result 5853 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 5854 5855 aese q3, v30.16b 5856 aesmc q3, q3 @ AES block 4k+7 - round 12 5857 mov r19, v1.d[0] @ AES block 4k+5 - mov low 5858 5859 fmov d1, r10 @ CTR block 4k+9 5860 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 5861 5862 fmov v1.d[1], r9 @ CTR block 4k+9 5863 rev r9, r12 @ CTR block 4k+10 5864 add r12, r12, #1 @ CTR block 4k+10 5865 5866 aese q3, v31.16b @ AES block 4k+7 - round 13 5867 orr r9, 
r11, r9, lsl #32 @ CTR block 4k+10 5868 5869 rev64 q5, q5 @ GHASH block 4k+5 5870 eor r20, r20, r14 @ AES block 4k+5 - round 14 high 5871 #ifdef __ARMEB__ 5872 rev r20, r20 5873 #endif 5874 stp r6, r7, [r2], #16 @ AES block 4k+4 - store result 5875 5876 eor r19, r19, r13 @ AES block 4k+5 - round 14 low 5877 #ifdef __ARMEB__ 5878 rev r19, r19 5879 #endif 5880 stp r19, r20, [r2], #16 @ AES block 4k+5 - store result 5881 5882 rev64 q4, q4 @ GHASH block 4k+4 5883 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 5884 blt .L256_dec_main_loop 5885 5886 5887 .L256_dec_prepretail:@ PREPRETAIL 5888 ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 5889 mov r21, v2.d[0] @ AES block 4k+2 - mov low 5890 eor q3, q7, q3 @ AES block 4k+3 - result 5891 5892 aese q0, v18.16b 5893 aesmc q0, q0 @ AES block 4k+4 - round 0 5894 mov r22, v2.d[1] @ AES block 4k+2 - mov high 5895 5896 aese q1, v18.16b 5897 aesmc q1, q1 @ AES block 4k+5 - round 0 5898 fmov d2, r10 @ CTR block 4k+6 5899 5900 fmov v2.d[1], r9 @ CTR block 4k+6 5901 rev r9, r12 @ CTR block 4k+7 5902 eor q4, q4, v11.16b @ PRE 1 5903 5904 rev64 q6, q6 @ GHASH block 4k+2 5905 orr r9, r11, r9, lsl #32 @ CTR block 4k+7 5906 mov r23, v3.d[0] @ AES block 4k+3 - mov low 5907 5908 aese q1, v19.16b 5909 aesmc q1, q1 @ AES block 4k+5 - round 1 5910 mov r24, v3.d[1] @ AES block 4k+3 - mov high 5911 5912 pmull v11.1q, q4, v15.1d @ GHASH block 4k - low 5913 mov d8, v4.d[1] @ GHASH block 4k - mid 5914 fmov d3, r10 @ CTR block 4k+7 5915 5916 pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high 5917 fmov v3.d[1], r9 @ CTR block 4k+7 5918 5919 aese q2, v18.16b 5920 aesmc q2, q2 @ AES block 4k+6 - round 0 5921 mov d10, v17.d[1] @ GHASH block 4k - mid 5922 5923 aese q0, v19.16b 5924 aesmc q0, q0 @ AES block 4k+4 - round 1 5925 eor q8, q8, q4 @ GHASH block 4k - mid 5926 5927 pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high 5928 5929 aese q2, v19.16b 5930 aesmc q2, q2 @ AES block 4k+6 - round 1 5931 rev64 q7, q7 @ GHASH block 4k+3 5932 5933 aese q3, 
v18.16b 5934 aesmc q3, q3 @ AES block 4k+7 - round 0 5935 5936 pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid 5937 eor q9, q9, q4 @ GHASH block 4k+1 - high 5938 5939 pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low 5940 5941 aese q3, v19.16b 5942 aesmc q3, q3 @ AES block 4k+7 - round 1 5943 mov d4, v5.d[1] @ GHASH block 4k+1 - mid 5944 5945 aese q0, v20.16b 5946 aesmc q0, q0 @ AES block 4k+4 - round 2 5947 5948 aese q1, v20.16b 5949 aesmc q1, q1 @ AES block 4k+5 - round 2 5950 eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low 5951 5952 aese q2, v20.16b 5953 aesmc q2, q2 @ AES block 4k+6 - round 2 5954 5955 aese q0, v21.16b 5956 aesmc q0, q0 @ AES block 4k+4 - round 3 5957 mov d8, v6.d[1] @ GHASH block 4k+2 - mid 5958 5959 aese q3, v20.16b 5960 aesmc q3, q3 @ AES block 4k+7 - round 2 5961 eor q4, q4, q5 @ GHASH block 4k+1 - mid 5962 5963 pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low 5964 5965 aese q0, v22.16b 5966 aesmc q0, q0 @ AES block 4k+4 - round 4 5967 5968 aese q3, v21.16b 5969 aesmc q3, q3 @ AES block 4k+7 - round 3 5970 eor q8, q8, q6 @ GHASH block 4k+2 - mid 5971 5972 pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid 5973 5974 aese q0, v23.16b 5975 aesmc q0, q0 @ AES block 4k+4 - round 5 5976 eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low 5977 5978 aese q3, v22.16b 5979 aesmc q3, q3 @ AES block 4k+7 - round 4 5980 5981 pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high 5982 eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid 5983 5984 pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high 5985 5986 aese q3, v23.16b 5987 aesmc q3, q3 @ AES block 4k+7 - round 5 5988 ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid 5989 5990 aese q2, v21.16b 5991 aesmc q2, q2 @ AES block 4k+6 - round 3 5992 5993 aese q1, v21.16b 5994 aesmc q1, q1 @ AES block 4k+5 - round 3 5995 eor q9, q9, q4 @ GHASH block 4k+2 - high 5996 5997 pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low 5998 5999 aese q2, v22.16b 6000 aesmc q2, q2 @ AES block 4k+6 - round 4 6001 mov d6, v7.d[1] @ GHASH 
block 4k+3 - mid 6002 6003 aese q1, v22.16b 6004 aesmc q1, q1 @ AES block 4k+5 - round 4 6005 6006 pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid 6007 6008 aese q2, v23.16b 6009 aesmc q2, q2 @ AES block 4k+6 - round 5 6010 eor q6, q6, q7 @ GHASH block 4k+3 - mid 6011 6012 aese q1, v23.16b 6013 aesmc q1, q1 @ AES block 4k+5 - round 5 6014 6015 aese q3, v24.16b 6016 aesmc q3, q3 @ AES block 4k+7 - round 6 6017 eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid 6018 6019 aese q2, v24.16b 6020 aesmc q2, q2 @ AES block 4k+6 - round 6 6021 6022 aese q0, v24.16b 6023 aesmc q0, q0 @ AES block 4k+4 - round 6 6024 movi q8, #0xc2 6025 6026 aese q1, v24.16b 6027 aesmc q1, q1 @ AES block 4k+5 - round 6 6028 eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low 6029 6030 pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid 6031 6032 aese q3, v25.16b 6033 aesmc q3, q3 @ AES block 4k+7 - round 7 6034 eor q9, q9, q5 @ GHASH block 4k+3 - high 6035 6036 aese q1, v25.16b 6037 aesmc q1, q1 @ AES block 4k+5 - round 7 6038 6039 aese q0, v25.16b 6040 aesmc q0, q0 @ AES block 4k+4 - round 7 6041 eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid 6042 6043 aese q3, v26.16b 6044 aesmc q3, q3 @ AES block 4k+7 - round 8 6045 6046 aese q2, v25.16b 6047 aesmc q2, q2 @ AES block 4k+6 - round 7 6048 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 6049 6050 aese q1, v26.16b 6051 aesmc q1, q1 @ AES block 4k+5 - round 8 6052 6053 aese q0, v26.16b 6054 aesmc q0, q0 @ AES block 4k+4 - round 8 6055 shl d8, d8, #56 @ mod_constant 6056 6057 aese q2, v26.16b 6058 aesmc q2, q2 @ AES block 4k+6 - round 8 6059 6060 aese q1, v27.16b 6061 aesmc q1, q1 @ AES block 4k+5 - round 9 6062 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 6063 6064 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 6065 6066 aese q2, v27.16b 6067 aesmc q2, q2 @ AES block 4k+6 - round 9 6068 ext q9, q9, q9, #8 @ MODULO - other top alignment 6069 6070 aese q3, v27.16b 6071 aesmc q3, q3 @ AES block 4k+7 - round 9 6072 6073 aese q0, 
v27.16b 6074 aesmc q0, q0 @ AES block 4k+4 - round 9 6075 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 6076 6077 aese q2, v28.16b 6078 aesmc q2, q2 @ AES block 4k+6 - round 10 6079 6080 aese q3, v28.16b 6081 aesmc q3, q3 @ AES block 4k+7 - round 10 6082 6083 aese q0, v28.16b 6084 aesmc q0, q0 @ AES block 4k+4 - round 10 6085 eor r22, r22, r14 @ AES block 4k+2 - round 14 high 6086 #ifdef __ARMEB__ 6087 rev r22, r22 6088 #endif 6089 aese q1, v28.16b 6090 aesmc q1, q1 @ AES block 4k+5 - round 10 6091 eor r23, r23, r13 @ AES block 4k+3 - round 14 low 6092 #ifdef __ARMEB__ 6093 rev r23, r23 6094 #endif 6095 aese q2, v29.16b 6096 aesmc q2, q2 @ AES block 4k+6 - round 11 6097 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 6098 6099 aese q0, v29.16b 6100 aesmc q0, q0 @ AES block 4k+4 - round 11 6101 add r12, r12, #1 @ CTR block 4k+7 6102 6103 aese q1, v29.16b 6104 aesmc q1, q1 @ AES block 4k+5 - round 11 6105 eor r21, r21, r13 @ AES block 4k+2 - round 14 low 6106 #ifdef __ARMEB__ 6107 rev r21, r21 6108 #endif 6109 6110 aese q2, v30.16b 6111 aesmc q2, q2 @ AES block 4k+6 - round 12 6112 6113 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 6114 eor r24, r24, r14 @ AES block 4k+3 - round 14 high 6115 #ifdef __ARMEB__ 6116 rev r24, r24 6117 #endif 6118 6119 aese q3, v29.16b 6120 aesmc q3, q3 @ AES block 4k+7 - round 11 6121 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result 6122 6123 aese q1, v30.16b 6124 aesmc q1, q1 @ AES block 4k+5 - round 12 6125 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 6126 6127 aese q0, v30.16b 6128 aesmc q0, q0 @ AES block 4k+4 - round 12 6129 stp r23, r24, [r2], #16 @ AES block 4k+3 - store result 6130 6131 aese q3, v30.16b 6132 aesmc q3, q3 @ AES block 4k+7 - round 12 6133 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 6134 6135 aese q1, v31.16b @ AES block 4k+5 - round 13 6136 6137 aese q0, v31.16b @ AES block 4k+4 - round 13 6138 6139 aese q3, v31.16b @ AES block 4k+7 - round 13 6140 6141 aese q2, 
v31.16b @ AES block 4k+6 - round 13 6142 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 6143 .L256_dec_tail:@ TAIL 6144 6145 sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process 6146 ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext 6147 6148 eor q0, q5, q0 @ AES block 4k+4 - result 6149 6150 mov r6, v0.d[0] @ AES block 4k+4 - mov low 6151 6152 mov r7, v0.d[1] @ AES block 4k+4 - mov high 6153 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag 6154 6155 cmp r5, #48 6156 6157 eor r6, r6, r13 @ AES block 4k+4 - round 14 low 6158 #ifdef __ARMEB__ 6159 rev r6, r6 6160 #endif 6161 6162 eor r7, r7, r14 @ AES block 4k+4 - round 14 high 6163 #ifdef __ARMEB__ 6164 rev r7, r7 6165 #endif 6166 bgt .L256_dec_blocks_more_than_3 6167 6168 sub r12, r12, #1 6169 mov q3, q2 6170 movi v10.8b, #0 6171 6172 movi v11.8b, #0 6173 cmp r5, #32 6174 6175 movi q9, #0 6176 mov q2, q1 6177 bgt .L256_dec_blocks_more_than_2 6178 6179 sub r12, r12, #1 6180 6181 mov q3, q1 6182 cmp r5, #16 6183 bgt .L256_dec_blocks_more_than_1 6184 6185 sub r12, r12, #1 6186 b .L256_dec_blocks_less_than_1 6187 .L256_dec_blocks_more_than_3:@ blocks left > 3 6188 rev64 q4, q5 @ GHASH final-3 block 6189 ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext 6190 6191 stp r6, r7, [r2], #16 @ AES final-3 block - store result 6192 6193 mov d10, v17.d[1] @ GHASH final-3 block - mid 6194 6195 eor q4, q4, q8 @ feed in partial tag 6196 6197 eor q0, q5, q1 @ AES final-2 block - result 6198 6199 mov d22, v4.d[1] @ GHASH final-3 block - mid 6200 6201 mov r6, v0.d[0] @ AES final-2 block - mov low 6202 6203 mov r7, v0.d[1] @ AES final-2 block - mov high 6204 6205 eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid 6206 6207 movi q8, #0 @ suppress further partial tag feed in 6208 6209 pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high 6210 6211 pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid 6212 eor r6, r6, r13 @ AES final-2 block - round 14 low 6213 #ifdef __ARMEB__ 6214 rev 
r6, r6 6215 #endif 6216 6217 pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low 6218 eor r7, r7, r14 @ AES final-2 block - round 14 high 6219 #ifdef __ARMEB__ 6220 rev r7, r7 6221 #endif 6222 .L256_dec_blocks_more_than_2:@ blocks left > 2 6223 6224 rev64 q4, q5 @ GHASH final-2 block 6225 ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext 6226 6227 eor q4, q4, q8 @ feed in partial tag 6228 stp r6, r7, [r2], #16 @ AES final-2 block - store result 6229 6230 eor q0, q5, q2 @ AES final-1 block - result 6231 6232 mov d22, v4.d[1] @ GHASH final-2 block - mid 6233 6234 pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low 6235 6236 pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high 6237 6238 eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid 6239 mov r6, v0.d[0] @ AES final-1 block - mov low 6240 6241 mov r7, v0.d[1] @ AES final-1 block - mov high 6242 eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low 6243 movi q8, #0 @ suppress further partial tag feed in 6244 6245 pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid 6246 6247 eor q9, q9, v20.16b @ GHASH final-2 block - high 6248 eor r6, r6, r13 @ AES final-1 block - round 14 low 6249 #ifdef __ARMEB__ 6250 rev r6, r6 6251 #endif 6252 6253 eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid 6254 eor r7, r7, r14 @ AES final-1 block - round 14 high 6255 #ifdef __ARMEB__ 6256 rev r7, r7 6257 #endif 6258 .L256_dec_blocks_more_than_1:@ blocks left > 1 6259 6260 stp r6, r7, [r2], #16 @ AES final-1 block - store result 6261 rev64 q4, q5 @ GHASH final-1 block 6262 6263 ld1 { q5}, [r0], #16 @ AES final block - load ciphertext 6264 6265 eor q4, q4, q8 @ feed in partial tag 6266 movi q8, #0 @ suppress further partial tag feed in 6267 6268 mov d22, v4.d[1] @ GHASH final-1 block - mid 6269 6270 eor q0, q5, q3 @ AES final block - result 6271 6272 pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high 6273 6274 eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid 6275 6276 pmull v21.1q, q4, v13.1d @ GHASH 
final-1 block - low 6277 mov r6, v0.d[0] @ AES final block - mov low 6278 6279 ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid 6280 6281 mov r7, v0.d[1] @ AES final block - mov high 6282 6283 pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid 6284 eor r6, r6, r13 @ AES final block - round 14 low 6285 #ifdef __ARMEB__ 6286 rev r6, r6 6287 #endif 6288 eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low 6289 6290 eor q9, q9, v20.16b @ GHASH final-1 block - high 6291 6292 eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid 6293 eor r7, r7, r14 @ AES final block - round 14 high 6294 #ifdef __ARMEB__ 6295 rev r7, r7 6296 #endif 6297 .L256_dec_blocks_less_than_1:@ blocks left <= 1 6298 6299 and r1, r1, #127 @ bit_length %= 128 6300 mvn r14, xzr @ rk14_h = 0xffffffffffffffff 6301 6302 sub r1, r1, #128 @ bit_length -= 128 6303 mvn r13, xzr @ rk14_l = 0xffffffffffffffff 6304 6305 ldp r4, r5, [r2] @ load existing bytes we need to not overwrite 6306 neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128]) 6307 6308 and r1, r1, #127 @ bit_length %= 128 6309 6310 lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block 6311 cmp r1, #64 6312 6313 csel r9, r13, r14, lt 6314 csel r10, r14, xzr, lt 6315 6316 fmov d0, r9 @ ctr0b is mask for last block 6317 and r6, r6, r9 6318 6319 mov v0.d[1], r10 6320 bic r4, r4, r9 @ mask out low existing bytes 6321 6322 #ifndef __ARMEB__ 6323 rev r9, r12 6324 #else 6325 mov r9, r12 6326 #endif 6327 6328 bic r5, r5, r10 @ mask out high existing bytes 6329 6330 orr r6, r6, r4 6331 6332 and r7, r7, r10 6333 6334 orr r7, r7, r5 6335 6336 and q5, q5, q0 @ possibly partial last block has zeroes in highest bits 6337 6338 rev64 q4, q5 @ GHASH final block 6339 6340 eor q4, q4, q8 @ feed in partial tag 6341 6342 pmull v21.1q, q4, v12.1d @ GHASH final block - low 6343 6344 mov d8, v4.d[1] @ GHASH final block - mid 6345 6346 eor q8, q8, q4 @ GHASH final block - mid 6347 6348 pmull2 v20.1q, q4, v12.2d @ GHASH final block - 
high 6349 6350 pmull v8.1q, q8, v16.1d @ GHASH final block - mid 6351 6352 eor q9, q9, v20.16b @ GHASH final block - high 6353 6354 eor v11.16b, v11.16b, v21.16b @ GHASH final block - low 6355 6356 eor v10.16b, v10.16b, q8 @ GHASH final block - mid 6357 movi q8, #0xc2 6358 6359 eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up 6360 6361 shl d8, d8, #56 @ mod_constant 6362 6363 eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up 6364 6365 pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid 6366 6367 ext q9, q9, q9, #8 @ MODULO - other top alignment 6368 6369 eor v10.16b, v10.16b, q7 @ MODULO - fold into mid 6370 6371 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid 6372 6373 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low 6374 6375 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment 6376 6377 eor v11.16b, v11.16b, q8 @ MODULO - fold into low 6378 6379 stp r6, r7, [r2] 6380 6381 str r9, [r16, #12] @ store the updated counter 6382 6383 eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low 6384 ext v11.16b, v11.16b, v11.16b, #8 6385 rev64 v11.16b, v11.16b 6386 mov r0, r15 6387 st1 { v11.16b }, [r3] 6388 6389 ldp r21, r22, [sp, #16] 6390 ldp r23, r24, [sp, #32] 6391 ldp d8, d9, [sp, #48] 6392 ldp d10, d11, [sp, #64] 6393 ldp d12, d13, [sp, #80] 6394 ldp d14, d15, [sp, #96] 6395 ldp r19, r20, [sp], #112 6396 RET 6397 6398 .L256_dec_ret: 6399 mov r0, #0x0 6400 RET 6401 .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6402 .section .rodata 6403 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6404 .align 2 6405 .align 2 6406 #endif 6407