#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch armv8-a+crypto
.text
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
//-----------------------------------------------------------------------
// aes_gcm_enc_128_kernel
//
// AES counter-mode encryption with ten AESE rounds plus a final round-key
// XOR (rk0..rk10), interleaved with GHASH accumulation. The main loop
// handles four 16-byte blocks (64 bytes) per iteration; the last one to
// four blocks always go through the tail code.
//
// Inputs, as used by the code below:
//   x0 = input pointer (plaintext is loaded from here)
//   x1 = length in bits (shifted right by 3 to get the byte count)
//   x2 = output pointer (ciphertext is stored here)
//   x3 = pointer to the 16-byte running GHASH value at offset 0, with
//        hash-key material loaded from offsets 32, 64, 80 and 112
//        (labelled h1, h2, h3 and h4 in the comments)
//   x4 = pointer to the 16-byte counter block; the 32-bit word at
//        offset 12 is rewritten with the updated counter on exit
//   x5 = pointer to the round keys: rk0..rk9 are read sequentially,
//        16 bytes each, and rk10 is read from offset 160
// Output:
//   x0 = byte count (x1 >> 3), or 0 when x1 is zero on entry
//
// Register roles after setup:
//   x16 = counter block pointer       x8  = round key pointer
//   x10 = low 64 bits of the counter block
//   x11 = next 32 bits of the counter block (upper half cleared)
//   w12 = running 32-bit counter, byte-reversed by rev after loading
//   x13, x14 = rk10 low and high      x15 = byte count to return
//   x4  = end of input                x5  = end of main-loop input
//   v18-v27 = rk0..rk9                v12-v15 = h1..h4
//   v16 = h2k | h1k                   v17 = h4k | h3k
//   v0-v3 = counter blocks / keystream, v4-v7 = data blocks
//   v11, v10, v9 = GHASH accumulator low, mid, high
//
// Saves and restores x19-x24 and d8-d15 in a 112-byte stack frame.
// Also written without being saved: x1, x2, x4-x16, v0-v7, v16-v31.
// NOTE(review): the tail loads 16 bytes from the output pointer and
// stores a full 16 bytes there even for a partial last block, so it
// presumably expects 16 accessible bytes at that position - confirm
// against the caller.
//-----------------------------------------------------------------------
aes_gcm_enc_128_kernel:
    AARCH64_VALID_CALL_TARGET
    cbz x1, .L128_enc_ret  // zero bits requested: return 0 without touching the stack
    stp x19, x20, [sp, #-112]!
    mov x16, x4  // x16 = counter block pointer (x4 is reused below)
    mov x8, x5  // x8 = round key pointer (x5 is reused below)
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]

    ldp x10, x11, [x16]  //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
    rev x10, x10
    rev x11, x11
#endif
    ldp x13, x14, [x8, #160]  //load rk10
#ifdef __AARCH64EB__
    ror x13, x13, #32
    ror x14, x14, #32
#endif
    // Load the running GHASH value; ext by 8 plus rev64 reverses all 16 bytes.
    ld1 {v11.16b}, [x3]
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    lsr x5, x1, #3  //byte_len
    mov x15, x5  // keep the byte count for the return value

    ld1 {v18.4s}, [x8], #16  //load rk0
    add x4, x0, x1, lsr #3  //end_input_ptr
    sub x5, x5, #1  //byte_len - 1

    lsr x12, x11, #32
    ldr q15, [x3, #112]  //load h4l | h4h
#ifndef __AARCH64EB__
    ext v15.16b, v15.16b, v15.16b, #8
#endif
    fmov d1, x10  //CTR block 1
    rev w12, w12  //rev_ctr32

    add w12, w12, #1  //increment rev_ctr32
    orr w11, w11, w11  // 32-bit write clears the upper half of x11
    ld1 {v19.4s}, [x8], #16  //load rk1

    rev w9, w12  //CTR block 1
    add w12, w12, #1  //CTR block 1
    fmov d3, x10  //CTR block 3

    orr x9, x11, x9, lsl #32  //CTR block 1
    ld1 { v0.16b}, [x16]  //special case vector load initial counter so we can start first AES block as quickly as possible

    fmov v1.d[1], x9  //CTR block 1
    rev w9, w12  //CTR block 2

    fmov d2, x10  //CTR block 2
    orr x9, x11, x9, lsl #32  //CTR block 2
    add w12, w12, #1  //CTR block 2

    fmov v2.d[1], x9  //CTR block 2
    rev w9, w12  //CTR block 3

    orr x9, x11, x9, lsl #32  //CTR block 3
    ld1 {v20.4s}, [x8], #16  //load rk2

    add w12, w12, #1  //CTR block 3
    fmov v3.d[1], x9  //CTR block 3

    ldr q14, [x3, #80]  //load h3l | h3h
#ifndef __AARCH64EB__
    ext v14.16b, v14.16b, v14.16b, #8
#endif
    // AES rounds 0-9 for counter blocks 0-3, interleaved with the
    // remaining round-key and hash-key loads.
    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 0
    ld1 {v21.4s}, [x8], #16  //load rk3

    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 0
    ldr q12, [x3, #32]  //load h1l | h1h
#ifndef __AARCH64EB__
    ext v12.16b, v12.16b, v12.16b, #8
#endif

    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 0
    ld1 {v22.4s}, [x8], #16  //load rk4

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 0
    ld1 {v23.4s}, [x8], #16  //load rk5

    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 1
    trn2 v17.2d, v14.2d, v15.2d  //h4l | h3l

    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 1
    ld1 {v24.4s}, [x8], #16  //load rk6

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 1
    ld1 {v25.4s}, [x8], #16  //load rk7

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 1
    trn1 v9.2d, v14.2d, v15.2d  //h4h | h3h

    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 2
    ld1 {v26.4s}, [x8], #16  //load rk8

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 2
    ldr q13, [x3, #64]  //load h2l | h2h
#ifndef __AARCH64EB__
    ext v13.16b, v13.16b, v13.16b, #8
#endif

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 2

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 2
    eor v17.16b, v17.16b, v9.16b  //h4k | h3k

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 3

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 3

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 3
    ld1 {v27.4s}, [x8], #16  //load rk9

    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 3

    and x5, x5, #0xffffffffffffffc0  //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    trn2 v16.2d, v12.2d, v13.2d  //h2l | h1l

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 4
    add x5, x5, x0  // x5 = input pointer at which the main loop must stop

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 4
    // Flags from this cmp are consumed by the b.ge further down; only
    // vector instructions, which leave the flags alone, sit in between.
    cmp x0, x5  //check if we have <= 4 blocks

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 4

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 5

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 5

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 5

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 6

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 4

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 6
    trn1 v8.2d, v12.2d, v13.2d  //h2h | h1h

    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 6

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 5

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 7

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 7

    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 6

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 7

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 8

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 7

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 8

    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 8

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 8

    aese v2.16b, v27.16b  //AES block 2 - round 9

    aese v0.16b, v27.16b  //AES block 0 - round 9

    eor v16.16b, v16.16b, v8.16b  //h2k | h1k

    aese v1.16b, v27.16b  //AES block 1 - round 9

    aese v3.16b, v27.16b  //AES block 3 - round 9
    b.ge .L128_enc_tail  //handle tail

    // First four blocks. The final round key (rk10, held in x13 and x14)
    // is XORed into the plaintext in general registers; XORing that with
    // the round-9 AES state then yields the ciphertext. No GHASH work is
    // done here: v4-v7 are hashed by the next stage.
    ldp x6, x7, [x0, #0]  //AES block 0 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    ldp x21, x22, [x0, #32]  //AES block 2 - load plaintext
#ifdef __AARCH64EB__
    rev x21, x21
    rev x22, x22
#endif
    ldp x19, x20, [x0, #16]  //AES block 1 - load plaintext
#ifdef __AARCH64EB__
    rev x19, x19
    rev x20, x20
#endif
    ldp x23, x24, [x0, #48]  //AES block 3 - load plaintext
#ifdef __AARCH64EB__
    rev x23, x23
    rev x24, x24
#endif
    eor x6, x6, x13  //AES block 0 - round 10 low
    eor x7, x7, x14  //AES block 0 - round 10 high

    eor x21, x21, x13  //AES block 2 - round 10 low
    fmov d4, x6  //AES block 0 - mov low

    eor x19, x19, x13  //AES block 1 - round 10 low
    eor x22, x22, x14  //AES block 2 - round 10 high
    fmov v4.d[1], x7  //AES block 0 - mov high

    fmov d5, x19  //AES block 1 - mov low
    eor x20, x20, x14  //AES block 1 - round 10 high

    eor x23, x23, x13  //AES block 3 - round 10 low
    fmov v5.d[1], x20  //AES block 1 - mov high

    fmov d6, x21  //AES block 2 - mov low
    eor x24, x24, x14  //AES block 3 - round 10 high
    rev w9, w12  //CTR block 4

    fmov v6.d[1], x22  //AES block 2 - mov high
    orr x9, x11, x9, lsl #32  //CTR block 4

    eor v4.16b, v4.16b, v0.16b  //AES block 0 - result
    fmov d0, x10  //CTR block 4
    add w12, w12, #1  //CTR block 4

    fmov v0.d[1], x9  //CTR block 4
    rev w9, w12  //CTR block 5

    eor v5.16b, v5.16b, v1.16b  //AES block 1 - result
    fmov d1, x10  //CTR block 5
    orr x9, x11, x9, lsl #32  //CTR block 5

    add w12, w12, #1  //CTR block 5
    add x0, x0, #64  //AES input_ptr update
    fmov v1.d[1], x9  //CTR block 5

    fmov d7, x23  //AES block 3 - mov low
    rev w9, w12  //CTR block 6
    st1 { v4.16b}, [x2], #16  //AES block 0 - store result

    fmov v7.d[1], x24  //AES block 3 - mov high
    orr x9, x11, x9, lsl #32  //CTR block 6

    add w12, w12, #1  //CTR block 6
    eor v6.16b, v6.16b, v2.16b  //AES block 2 - result
    st1 { v5.16b}, [x2], #16  //AES block 1 - store result

    fmov d2, x10  //CTR block 6
    cmp x0, x5  //check if we have <= 8 blocks

    fmov v2.d[1], x9  //CTR block 6
    rev w9, w12  //CTR block 7
    st1 { v6.16b}, [x2], #16  //AES block 2 - store result

    orr x9, x11, x9, lsl #32  //CTR block 7

    eor v7.16b, v7.16b, v3.16b  //AES block 3 - result
    st1 { v7.16b}, [x2], #16  //AES block 3 - store result
    b.ge .L128_enc_prepretail  //do prepretail

    // Main loop. Each pass hashes the four ciphertext blocks produced by
    // the previous stage (still in v4-v7) while encrypting the next four
    // blocks. On entry v0-v2 hold the next three counter blocks and x9
    // holds the upper half of the fourth. Instruction order is hand
    // scheduled; registers are reused as soon as they become free.
.L128_enc_main_loop:  //main loop start
    ldp x23, x24, [x0, #48]  //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
    rev x23, x23
    rev x24, x24
#endif
    rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
    rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)

    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0
    fmov d3, x10  //CTR block 4k+3

    ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
    rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)

    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
    add w12, w12, #1  //CTR block 4k+3
    fmov v3.d[1], x9  //CTR block 4k+3

    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
    mov d31, v6.d[1]  //GHASH block 4k+2 - mid

    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
    mov d30, v5.d[1]  //GHASH block 4k+1 - mid

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
    eor v4.16b, v4.16b, v11.16b  //PRE 1

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
    eor x24, x24, x14  //AES block 4k+3 - round 10 high

    pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high
    eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid
    ldp x6, x7, [x0, #0]  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
    rev w9, w12  //CTR block 4k+8

    eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid
    mov d8, v4.d[1]  //GHASH block 4k - mid
    orr x9, x11, x9, lsl #32  //CTR block 4k+8

    pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
    add w12, w12, #1  //CTR block 4k+8
    mov d10, v17.d[1]  //GHASH block 4k - mid

    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2

    pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
    eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3
    eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high

    pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low

    pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid
    rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)

    pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid

    pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low
    ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid

    pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
    eor x7, x7, x14  //AES block 4k+4 - round 10 high

    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
    mov d30, v7.d[1]  //GHASH block 4k+3 - mid

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
    eor x6, x6, x13  //AES block 4k+4 - round 10 low

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
    eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid

    pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3
    eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high

    pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid

    pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
    movi v8.8b, #0xc2  // with the shl by 56 below, d8 becomes 0xc2 << 56

    pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
    eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2
    shl d8, d8, #56  //mod_constant

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4
    eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5
    ldp x19, x20, [x0, #16]  //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
    rev x19, x19
    rev x20, x20
#endif
    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3
    eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
    ldp x21, x22, [x0, #32]  //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
    rev x21, x21
    rev x22, x22
#endif
    pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
    eor x19, x19, x13  //AES block 4k+5 - round 10 low

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4
    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid

    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6
    eor x23, x23, x13  //AES block 4k+3 - round 10 low

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
    eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up

    fmov d4, x6  //AES block 4k+4 - mov low
    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6
    fmov v4.d[1], x7  //AES block 4k+4 - mov high

    add x0, x0, #64  //AES input_ptr update
    fmov d7, x23  //AES block 4k+3 - mov low
    ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5
    fmov d5, x19  //AES block 4k+5 - mov low

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7
    eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6
    eor x20, x20, x14  //AES block 4k+5 - round 10 high

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
    fmov v5.d[1], x20  //AES block 4k+5 - mov high

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
    fmov v7.d[1], x24  //AES block 4k+3 - mov high

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
    // Flags set here are consumed by the b.lt at the bottom of the loop;
    // nothing in between writes the flags.
    cmp x0, x5  //.LOOP CONTROL

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8
    eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid

    aese v0.16b, v27.16b  //AES block 4k+4 - round 9
    eor x21, x21, x13  //AES block 4k+6 - round 10 low
    eor x22, x22, x14  //AES block 4k+6 - round 10 high

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7
    fmov d6, x21  //AES block 4k+6 - mov low

    aese v1.16b, v27.16b  //AES block 4k+5 - round 9
    fmov v6.d[1], x22  //AES block 4k+6 - mov high

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7
    eor v4.16b, v4.16b, v0.16b  //AES block 4k+4 - result

    fmov d0, x10  //CTR block 4k+8
    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8

    fmov v0.d[1], x9  //CTR block 4k+8
    rev w9, w12  //CTR block 4k+9
    eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8
    eor v5.16b, v5.16b, v1.16b  //AES block 4k+5 - result

    add w12, w12, #1  //CTR block 4k+9
    orr x9, x11, x9, lsl #32  //CTR block 4k+9
    fmov d1, x10  //CTR block 4k+9

    pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low
    fmov v1.d[1], x9  //CTR block 4k+9
    rev w9, w12  //CTR block 4k+10

    aese v2.16b, v27.16b  //AES block 4k+6 - round 9
    st1 { v4.16b}, [x2], #16  //AES block 4k+4 - store result
    eor v6.16b, v6.16b, v2.16b  //AES block 4k+6 - result
    orr x9, x11, x9, lsl #32  //CTR block 4k+10

    aese v3.16b, v27.16b  //AES block 4k+7 - round 9
    add w12, w12, #1  //CTR block 4k+10
    ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment
    fmov d2, x10  //CTR block 4k+10

    eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
    st1 { v5.16b}, [x2], #16  //AES block 4k+5 - store result

    fmov v2.d[1], x9  //CTR block 4k+10
    st1 { v6.16b}, [x2], #16  //AES block 4k+6 - store result
    rev w9, w12  //CTR block 4k+11

    orr x9, x11, x9, lsl #32  //CTR block 4k+11
    eor v7.16b, v7.16b, v3.16b  //AES block 4k+3 - result

    eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
    st1 { v7.16b}, [x2], #16  //AES block 4k+3 - store result
    b.lt .L128_enc_main_loop

    // Prepretail. Hashes the last four ciphertext blocks written above and
    // runs AES rounds 0-9 on the next four counter blocks. No plaintext is
    // loaded and nothing is stored here; the tail consumes the keystream.
.L128_enc_prepretail:  //PREPRETAIL
    rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
    fmov d3, x10  //CTR block 4k+3
    rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)

    ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
    add w12, w12, #1  //CTR block 4k+3
    fmov v3.d[1], x9  //CTR block 4k+3

    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
    rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)

    pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low

    rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    eor v4.16b, v4.16b, v11.16b  //PRE 1

    pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high

    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
    mov d30, v5.d[1]  //GHASH block 4k+1 - mid

    pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
    mov d8, v4.d[1]  //GHASH block 4k - mid

    mov d31, v6.d[1]  //GHASH block 4k+2 - mid
    mov d10, v17.d[1]  //GHASH block 4k - mid

    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
    eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid

    eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid

    pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
    eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid

    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1

    pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low

    pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid

    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
    ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid

    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0

    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
    mov d30, v7.d[1]  //GHASH block 4k+3 - mid

    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
    eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high

    pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid

    pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
    eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid

    pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high

    pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low

    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
    eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high

    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2

    pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
    movi v8.8b, #0xc2

    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
    eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low

    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2

    pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
    eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid

    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3

    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2
    eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high

    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3

    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid
    shl d8, d8, #56  //mod_constant

    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low

    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4

    // The uncommented vector ops from here to the tail label fold the
    // high and mid accumulators into v11, matching the MODULO steps that
    // are labelled in the main loop.
    pmull v28.1q, v9.1d, v8.1d
    eor v10.16b, v10.16b, v9.16b  //karatsuba tidy up

    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4

    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
    ext v9.16b, v9.16b, v9.16b, #8

    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3

    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
    eor v10.16b, v10.16b, v11.16b

    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6

    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4

    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5

    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
    eor v10.16b, v10.16b, v28.16b

    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5

    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6

    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6

    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
    eor v10.16b, v10.16b, v9.16b

    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7

    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7

    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7

    pmull v28.1q, v10.1d, v8.1d

    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
    ext v10.16b, v10.16b, v10.16b, #8

    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8

    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
    eor v11.16b, v11.16b, v28.16b

    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8

    aese v3.16b, v27.16b  //AES block 4k+7 - round 9

    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8

    aese v0.16b, v27.16b  //AES block 4k+4 - round 9

    aese v1.16b, v27.16b  //AES block 4k+5 - round 9
    eor v11.16b, v11.16b, v10.16b

    aese v2.16b, v27.16b  //AES block 4k+6 - round 9
    // Tail. Between 1 and 64 bytes remain (x5 = x4 - x0). v0-v3 hold
    // keystream for up to four blocks. With fewer than four blocks left,
    // the keystream registers are shuffled so that the last block always
    // uses v3, and w12 is decremented once per counter block that was
    // prepared but is not consumed.
.L128_enc_tail:  //TAIL

    sub x5, x4, x0  //main_end_input_ptr is number of bytes left to process
    ldp x6, x7, [x0], #16  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    cmp x5, #48

    ext v8.16b, v11.16b, v11.16b, #8  //prepare final partial tag
    eor x6, x6, x13  //AES block 4k+4 - round 10 low
    eor x7, x7, x14  //AES block 4k+4 - round 10 high

    fmov d4, x6  //AES block 4k+4 - mov low

    fmov v4.d[1], x7  //AES block 4k+4 - mov high

    eor v5.16b, v4.16b, v0.16b  //AES block 4k+4 - result

    b.gt .L128_enc_blocks_more_than_3

    // Three blocks or fewer: zero the accumulators, since the running
    // value is fed in through v8 instead.
    sub w12, w12, #1
    movi v11.8b, #0
    mov v3.16b, v2.16b

    cmp x5, #32
    mov v2.16b, v1.16b
    movi v9.8b, #0

    movi v10.8b, #0
    b.gt .L128_enc_blocks_more_than_2

    mov v3.16b, v1.16b
    cmp x5, #16

    sub w12, w12, #1
    b.gt .L128_enc_blocks_more_than_1

    sub w12, w12, #1
    b .L128_enc_blocks_less_than_1
    // In the tail blocks below, v20, v21 and v22 are used as GHASH
    // temporaries, overwriting the round keys loaded into them earlier.
.L128_enc_blocks_more_than_3:  //blocks left > 3
    st1 { v5.16b}, [x2], #16  //AES final-3 block - store result

    ldp x6, x7, [x0], #16  //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    rev64 v4.16b, v5.16b  //GHASH final-3 block

    eor v4.16b, v4.16b, v8.16b  //feed in partial tag
    eor x7, x7, x14  //AES final-2 block - round 10 high
    eor x6, x6, x13  //AES final-2 block - round 10 low

    fmov d5, x6  //AES final-2 block - mov low

    movi v8.8b, #0  //suppress further partial tag feed in
    fmov v5.d[1], x7  //AES final-2 block - mov high

    pmull v11.1q, v4.1d, v15.1d  //GHASH final-3 block - low
    mov d22, v4.d[1]  //GHASH final-3 block - mid

    pmull2 v9.1q, v4.2d, v15.2d  //GHASH final-3 block - high

    mov d10, v17.d[1]  //GHASH final-3 block - mid

    eor v5.16b, v5.16b, v1.16b  //AES final-2 block - result
    eor v22.8b, v22.8b, v4.8b  //GHASH final-3 block - mid

    pmull v10.1q, v22.1d, v10.1d  //GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:  //blocks left > 2

    st1 { v5.16b}, [x2], #16  //AES final-2 block - store result

    rev64 v4.16b, v5.16b  //GHASH final-2 block
    ldp x6, x7, [x0], #16  //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag

    eor x6, x6, x13  //AES final-1 block - round 10 low

    fmov d5, x6  //AES final-1 block - mov low
    eor x7, x7, x14  //AES final-1 block - round 10 high

    pmull2 v20.1q, v4.2d, v14.2d  //GHASH final-2 block - high
    fmov v5.d[1], x7  //AES final-1 block - mov high

    mov d22, v4.d[1]  //GHASH final-2 block - mid

    pmull v21.1q, v4.1d, v14.1d  //GHASH final-2 block - low

    eor v9.16b, v9.16b, v20.16b  //GHASH final-2 block - high

    eor v22.8b, v22.8b, v4.8b  //GHASH final-2 block - mid

    eor v5.16b, v5.16b, v2.16b  //AES final-1 block - result

    eor v11.16b, v11.16b, v21.16b  //GHASH final-2 block - low

    pmull v22.1q, v22.1d, v17.1d  //GHASH final-2 block - mid

    movi v8.8b, #0  //suppress further partial tag feed in

    eor v10.16b, v10.16b, v22.16b  //GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:  //blocks left > 1

    st1 { v5.16b}, [x2], #16  //AES final-1 block - store result

    rev64 v4.16b, v5.16b  //GHASH final-1 block
    ldp x6, x7, [x0], #16  //AES final block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag

    eor x7, x7, x14  //AES final block - round 10 high
    eor x6, x6, x13  //AES final block - round 10 low

    fmov d5, x6  //AES final block - mov low

    pmull2 v20.1q, v4.2d, v13.2d  //GHASH final-1 block - high
    fmov v5.d[1], x7  //AES final block - mov high

    mov d22, v4.d[1]  //GHASH final-1 block - mid

    pmull v21.1q, v4.1d, v13.1d  //GHASH final-1 block - low

    eor v22.8b, v22.8b, v4.8b  //GHASH final-1 block - mid

    eor v5.16b, v5.16b, v3.16b  //AES final block - result

    ins v22.d[1], v22.d[0]  //GHASH final-1 block - mid

    pmull2 v22.1q, v22.2d, v16.2d  //GHASH final-1 block - mid

    eor v11.16b, v11.16b, v21.16b  //GHASH final-1 block - low

    eor v9.16b, v9.16b, v20.16b  //GHASH final-1 block - high

    eor v10.16b, v10.16b, v22.16b  //GHASH final-1 block - mid
    movi v8.8b, #0  //suppress further partial tag feed in
    // Last block, possibly partial. Build a 128-bit mask in v0 that keeps
    // only the valid bits of the block. After the arithmetic below
    // x1 = (128 - (bits mod 128)) mod 128, i.e. the number of unused bits
    // in the last block (0 when the block is full). x13 and x14 no longer
    // hold rk10 from here on.
.L128_enc_blocks_less_than_1:  //blocks left <= 1

    and x1, x1, #127  //bit_length %= 128
    mvn x13, xzr  //rk10_l = 0xffffffffffffffff

    mvn x14, xzr  //rk10_h = 0xffffffffffffffff
    sub x1, x1, #128  //bit_length -= 128

    neg x1, x1  //bit_length = 128 - #bits in input (in range [1,128])

    and x1, x1, #127  //bit_length %= 128

    // Register-controlled shifts use the amount modulo 64, so x14 is the
    // partial 64-bit mask for whichever half holds the boundary.
    lsr x14, x14, x1  //rk10_h is mask for top 64b of last block
    cmp x1, #64

    // x1 < 64: low half all ones, high half partial.
    // x1 >= 64: low half partial, high half zero.
    csel x6, x13, x14, lt
    csel x7, x14, xzr, lt

    fmov d0, x6  //ctr0b is mask for last block

    fmov v0.d[1], x7

    and v5.16b, v5.16b, v0.16b  //possibly partial last block has zeroes in highest bits

    rev64 v4.16b, v5.16b  //GHASH final block

    eor v4.16b, v4.16b, v8.16b  //feed in partial tag

    mov d8, v4.d[1]  //GHASH final block - mid

    pmull v21.1q, v4.1d, v12.1d  //GHASH final block - low
    ld1 { v18.16b}, [x2]  //load existing bytes where the possibly partial last block is to be stored

    eor v8.8b, v8.8b, v4.8b  //GHASH final block - mid
    // w9 = counter word in the byte order in which it is stored back.
#ifndef __AARCH64EB__
    rev w9, w12
#else
    mov w9, w12
#endif
    pmull2 v20.1q, v4.2d, v12.2d  //GHASH final block - high

    pmull v8.1q, v8.1d, v16.1d  //GHASH final block - mid

    eor v11.16b, v11.16b, v21.16b  //GHASH final block - low

    eor v9.16b, v9.16b, v20.16b  //GHASH final block - high

    eor v10.16b, v10.16b, v8.16b  //GHASH final block - mid
    movi v8.8b, #0xc2

    eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up

    shl d8, d8, #56  //mod_constant

    eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up

    pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid

    ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment

    eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid

    eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid

    pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low

    ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment

    bif v5.16b, v18.16b, v0.16b  //insert existing bytes in top end of result before storing

    eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
    st1 { v5.16b}, [x2]  //store all 16B

    str w9, [x16, #12]  //store the updated counter

    eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
    // Undo the byte reversal applied on entry and write the GHASH value back.
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    mov x0, x15  // return the byte count
    st1 { v11.16b }, [x3]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    ldp x19, x20, [sp], #112
    ret

    // Zero-length request: no frame was pushed, so return 0 directly.
.L128_enc_ret:
    mov w0, #0x0
    ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
    AARCH64_VALID_CALL_TARGET
    cbz x1, .L128_dec_ret
    stp x19, x20, [sp, #-112]!
    mov x16, x4
    mov x8, x5
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]

    lsr x5, x1, #3  //byte_len
    mov x15, x5
    ldp x10, x11, [x16]  //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
    rev x10, x10
    rev x11, x11
#endif
    ldp x13, x14, [x8, #160]  //load rk10
#ifdef __AARCH64EB__
    ror x14, x14, 32
    ror x13, x13, 32
#endif
    sub x5, x5, #1  //byte_len - 1
    ld1 {v18.4s}, [x8], #16  //load rk0

    and x5, x5, #0xffffffffffffffc0  //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    ld1 { v0.16b}, [x16]  //special case vector load initial counter so we can start first AES block as quickly as possible

    ldr q13, [x3, #64]  //load h2l | h2h
#ifndef __AARCH64EB__
    ext v13.16b, v13.16b, v13.16b, #8
#endif
    lsr x12, x11, #32
    fmov d2, x10  //CTR block 2

    ld1 {v19.4s}, [x8], #16  //load rk1
    orr w11, w11, w11
    rev w12, w12  //rev_ctr32

    fmov d1, x10  //CTR block 1
    add w12, w12, #1  //increment rev_ctr32

    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 0
    rev w9, w12  //CTR block 1

    orr x9, x11, x9, lsl #32  //CTR block 1
    ld1 {v20.4s}, [x8], #16  //load rk2
    add w12, w12, #1  //CTR block 1

    fmov v1.d[1], x9  //CTR block 
1 1046 rev w9, w12 //CTR block 2 1047 add w12, w12, #1 //CTR block 2 1048 1049 aese v0.16b, v19.16b 1050 aesmc v0.16b, v0.16b //AES block 0 - round 1 1051 orr x9, x11, x9, lsl #32 //CTR block 2 1052 1053 fmov v2.d[1], x9 //CTR block 2 1054 rev w9, w12 //CTR block 3 1055 1056 fmov d3, x10 //CTR block 3 1057 orr x9, x11, x9, lsl #32 //CTR block 3 1058 add w12, w12, #1 //CTR block 3 1059 1060 fmov v3.d[1], x9 //CTR block 3 1061 add x4, x0, x1, lsr #3 //end_input_ptr 1062 1063 aese v1.16b, v18.16b 1064 aesmc v1.16b, v1.16b //AES block 1 - round 0 1065 ld1 {v21.4s}, [x8], #16 //load rk3 1066 1067 aese v0.16b, v20.16b 1068 aesmc v0.16b, v0.16b //AES block 0 - round 2 1069 ld1 {v22.4s}, [x8], #16 //load rk4 1070 1071 aese v2.16b, v18.16b 1072 aesmc v2.16b, v2.16b //AES block 2 - round 0 1073 ld1 {v23.4s}, [x8], #16 //load rk5 1074 1075 aese v1.16b, v19.16b 1076 aesmc v1.16b, v1.16b //AES block 1 - round 1 1077 ld1 {v24.4s}, [x8], #16 //load rk6 1078 1079 aese v3.16b, v18.16b 1080 aesmc v3.16b, v3.16b //AES block 3 - round 0 1081 1082 aese v2.16b, v19.16b 1083 aesmc v2.16b, v2.16b //AES block 2 - round 1 1084 1085 aese v1.16b, v20.16b 1086 aesmc v1.16b, v1.16b //AES block 1 - round 2 1087 1088 aese v3.16b, v19.16b 1089 aesmc v3.16b, v3.16b //AES block 3 - round 1 1090 ld1 { v11.16b}, [x3] 1091 ext v11.16b, v11.16b, v11.16b, #8 1092 rev64 v11.16b, v11.16b 1093 1094 aese v0.16b, v21.16b 1095 aesmc v0.16b, v0.16b //AES block 0 - round 3 1096 ld1 {v25.4s}, [x8], #16 //load rk7 1097 1098 aese v1.16b, v21.16b 1099 aesmc v1.16b, v1.16b //AES block 1 - round 3 1100 1101 aese v3.16b, v20.16b 1102 aesmc v3.16b, v3.16b //AES block 3 - round 2 1103 1104 aese v2.16b, v20.16b 1105 aesmc v2.16b, v2.16b //AES block 2 - round 2 1106 ld1 {v26.4s}, [x8], #16 //load rk8 1107 1108 aese v1.16b, v22.16b 1109 aesmc v1.16b, v1.16b //AES block 1 - round 4 1110 1111 aese v3.16b, v21.16b 1112 aesmc v3.16b, v3.16b //AES block 3 - round 3 1113 1114 aese v2.16b, v21.16b 1115 aesmc v2.16b, v2.16b //AES 
block 2 - round 3 1116 ldr q14, [x3, #80] //load h3l | h3h 1117 #ifndef __AARCH64EB__ 1118 ext v14.16b, v14.16b, v14.16b, #8 1119 #endif 1120 aese v0.16b, v22.16b 1121 aesmc v0.16b, v0.16b //AES block 0 - round 4 1122 ld1 {v27.4s}, [x8], #16 //load rk9 1123 1124 aese v1.16b, v23.16b 1125 aesmc v1.16b, v1.16b //AES block 1 - round 5 1126 1127 aese v2.16b, v22.16b 1128 aesmc v2.16b, v2.16b //AES block 2 - round 4 1129 1130 aese v3.16b, v22.16b 1131 aesmc v3.16b, v3.16b //AES block 3 - round 4 1132 1133 aese v0.16b, v23.16b 1134 aesmc v0.16b, v0.16b //AES block 0 - round 5 1135 1136 aese v2.16b, v23.16b 1137 aesmc v2.16b, v2.16b //AES block 2 - round 5 1138 ldr q12, [x3, #32] //load h1l | h1h 1139 #ifndef __AARCH64EB__ 1140 ext v12.16b, v12.16b, v12.16b, #8 1141 #endif 1142 aese v3.16b, v23.16b 1143 aesmc v3.16b, v3.16b //AES block 3 - round 5 1144 1145 aese v0.16b, v24.16b 1146 aesmc v0.16b, v0.16b //AES block 0 - round 6 1147 1148 aese v1.16b, v24.16b 1149 aesmc v1.16b, v1.16b //AES block 1 - round 6 1150 1151 aese v3.16b, v24.16b 1152 aesmc v3.16b, v3.16b //AES block 3 - round 6 1153 1154 aese v2.16b, v24.16b 1155 aesmc v2.16b, v2.16b //AES block 2 - round 6 1156 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 1157 1158 ldr q15, [x3, #112] //load h4l | h4h 1159 #ifndef __AARCH64EB__ 1160 ext v15.16b, v15.16b, v15.16b, #8 1161 #endif 1162 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 1163 add x5, x5, x0 1164 1165 aese v1.16b, v25.16b 1166 aesmc v1.16b, v1.16b //AES block 1 - round 7 1167 1168 aese v2.16b, v25.16b 1169 aesmc v2.16b, v2.16b //AES block 2 - round 7 1170 1171 aese v0.16b, v25.16b 1172 aesmc v0.16b, v0.16b //AES block 0 - round 7 1173 eor v16.16b, v16.16b, v8.16b //h2k | h1k 1174 1175 aese v3.16b, v25.16b 1176 aesmc v3.16b, v3.16b //AES block 3 - round 7 1177 1178 aese v1.16b, v26.16b 1179 aesmc v1.16b, v1.16b //AES block 1 - round 8 1180 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 1181 1182 aese v2.16b, v26.16b 1183 aesmc v2.16b, v2.16b //AES block 2 - round 8 1184 1185 
aese v3.16b, v26.16b 1186 aesmc v3.16b, v3.16b //AES block 3 - round 8 1187 1188 aese v0.16b, v26.16b 1189 aesmc v0.16b, v0.16b //AES block 0 - round 8 1190 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 1191 1192 aese v2.16b, v27.16b //AES block 2 - round 9 1193 1194 aese v3.16b, v27.16b //AES block 3 - round 9 1195 1196 aese v0.16b, v27.16b //AES block 0 - round 9 1197 cmp x0, x5 //check if we have <= 4 blocks 1198 1199 aese v1.16b, v27.16b //AES block 1 - round 9 1200 eor v17.16b, v17.16b, v9.16b //h4k | h3k 1201 b.ge .L128_dec_tail //handle tail 1202 1203 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext 1204 1205 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 1206 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 1207 1208 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 1209 rev64 v4.16b, v4.16b //GHASH block 0 1210 rev w9, w12 //CTR block 4 1211 1212 orr x9, x11, x9, lsl #32 //CTR block 4 1213 add w12, w12, #1 //CTR block 4 1214 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 1215 1216 rev64 v5.16b, v5.16b //GHASH block 1 1217 mov x19, v1.d[0] //AES block 1 - mov low 1218 1219 mov x20, v1.d[1] //AES block 1 - mov high 1220 1221 mov x6, v0.d[0] //AES block 0 - mov low 1222 cmp x0, x5 //check if we have <= 8 blocks 1223 1224 mov x7, v0.d[1] //AES block 0 - mov high 1225 1226 fmov d0, x10 //CTR block 4 1227 1228 fmov v0.d[1], x9 //CTR block 4 1229 rev w9, w12 //CTR block 5 1230 eor x19, x19, x13 //AES block 1 - round 10 low 1231 #ifdef __AARCH64EB__ 1232 rev x19, x19 1233 #endif 1234 fmov d1, x10 //CTR block 5 1235 add w12, w12, #1 //CTR block 5 1236 orr x9, x11, x9, lsl #32 //CTR block 5 1237 1238 fmov v1.d[1], x9 //CTR block 5 1239 rev w9, w12 //CTR block 6 1240 add w12, w12, #1 //CTR block 6 1241 1242 orr x9, x11, x9, lsl #32 //CTR block 6 1243 1244 eor x20, x20, x14 //AES block 1 - round 10 high 1245 #ifdef __AARCH64EB__ 1246 rev x20, x20 1247 #endif 1248 eor x6, x6, x13 //AES block 0 - round 10 low 
1249 #ifdef __AARCH64EB__ 1250 rev x6, x6 1251 #endif 1252 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 1253 1254 eor x7, x7, x14 //AES block 0 - round 10 high 1255 #ifdef __AARCH64EB__ 1256 rev x7, x7 1257 #endif 1258 stp x6, x7, [x2], #16 //AES block 0 - store result 1259 1260 stp x19, x20, [x2], #16 //AES block 1 - store result 1261 b.ge .L128_dec_prepretail //do prepretail 1262 1263 .L128_dec_main_loop: //main loop start 1264 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1265 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1266 mov x21, v2.d[0] //AES block 4k+2 - mov low 1267 1268 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1269 mov x22, v2.d[1] //AES block 4k+2 - mov high 1270 1271 aese v1.16b, v18.16b 1272 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1273 fmov d2, x10 //CTR block 4k+6 1274 1275 rev64 v6.16b, v6.16b //GHASH block 4k+2 1276 fmov v2.d[1], x9 //CTR block 4k+6 1277 rev w9, w12 //CTR block 4k+7 1278 1279 mov x23, v3.d[0] //AES block 4k+3 - mov low 1280 eor v4.16b, v4.16b, v11.16b //PRE 1 1281 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1282 1283 aese v1.16b, v19.16b 1284 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1285 rev64 v7.16b, v7.16b //GHASH block 4k+3 1286 1287 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1288 mov x24, v3.d[1] //AES block 4k+3 - mov high 1289 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1290 1291 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1292 fmov d3, x10 //CTR block 4k+7 1293 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1294 1295 aese v1.16b, v20.16b 1296 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1297 fmov v3.d[1], x9 //CTR block 4k+7 1298 1299 aese v2.16b, v18.16b 1300 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1301 mov d10, v17.d[1] //GHASH block 4k - mid 1302 1303 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1304 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1305 1306 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1307 1308 aese v1.16b, v21.16b 
1309 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1310 mov d8, v4.d[1] //GHASH block 4k - mid 1311 1312 aese v3.16b, v18.16b 1313 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1314 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1315 1316 aese v0.16b, v18.16b 1317 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1318 1319 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1320 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1321 1322 aese v3.16b, v19.16b 1323 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1324 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1325 #ifdef __AARCH64EB__ 1326 rev x23, x23 1327 #endif 1328 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1329 eor x22, x22, x14 //AES block 4k+2 - round 10 high 1330 #ifdef __AARCH64EB__ 1331 rev x22, x22 1332 #endif 1333 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1334 1335 aese v0.16b, v19.16b 1336 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1337 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1338 1339 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1340 1341 aese v3.16b, v20.16b 1342 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1343 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1344 1345 aese v0.16b, v20.16b 1346 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1347 1348 aese v1.16b, v22.16b 1349 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1350 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1351 1352 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1353 1354 aese v0.16b, v21.16b 1355 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1356 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1357 1358 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1359 1360 aese v2.16b, v19.16b 1361 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1362 mov d30, v7.d[1] //GHASH block 4k+3 - mid 1363 1364 aese v0.16b, v22.16b 1365 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1366 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1367 1368 pmull2 v31.1q, v31.2d, 
v16.2d //GHASH block 4k+2 - mid 1369 eor x24, x24, x14 //AES block 4k+3 - round 10 high 1370 #ifdef __AARCH64EB__ 1371 rev x24, x24 1372 #endif 1373 aese v2.16b, v20.16b 1374 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1375 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1376 1377 aese v1.16b, v23.16b 1378 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1379 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1380 #ifdef __AARCH64EB__ 1381 rev x21, x21 1382 #endif 1383 aese v0.16b, v23.16b 1384 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1385 movi v8.8b, #0xc2 1386 1387 aese v2.16b, v21.16b 1388 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1389 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1390 1391 aese v1.16b, v24.16b 1392 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1393 1394 aese v0.16b, v24.16b 1395 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1396 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1397 1398 aese v2.16b, v22.16b 1399 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1400 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1401 1402 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1403 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1404 ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext 1405 1406 aese v1.16b, v25.16b 1407 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1408 add w12, w12, #1 //CTR block 4k+7 1409 1410 aese v0.16b, v25.16b 1411 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1412 shl d8, d8, #56 //mod_constant 1413 1414 aese v2.16b, v23.16b 1415 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1416 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1417 1418 aese v1.16b, v26.16b 1419 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1420 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1421 1422 aese v0.16b, v26.16b 1423 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1424 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1425 1426 aese v3.16b, v21.16b 1427 aesmc 
v3.16b, v3.16b //AES block 4k+7 - round 3 1428 rev w9, w12 //CTR block 4k+8 1429 1430 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1431 ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1432 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1433 1434 aese v0.16b, v27.16b //AES block 4k+4 - round 9 1435 orr x9, x11, x9, lsl #32 //CTR block 4k+8 1436 1437 aese v3.16b, v22.16b 1438 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1439 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1440 1441 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1442 1443 aese v2.16b, v24.16b 1444 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1445 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 1446 1447 aese v3.16b, v23.16b 1448 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1449 ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 1450 1451 add w12, w12, #1 //CTR block 4k+8 1452 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1453 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 1454 1455 aese v2.16b, v25.16b 1456 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1457 ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 1458 1459 aese v3.16b, v24.16b 1460 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1461 1462 rev64 v5.16b, v5.16b //GHASH block 4k+5 1463 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1464 mov x7, v0.d[1] //AES block 4k+4 - mov high 1465 1466 aese v2.16b, v26.16b 1467 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1468 mov x6, v0.d[0] //AES block 4k+4 - mov low 1469 1470 aese v3.16b, v25.16b 1471 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1472 fmov d0, x10 //CTR block 4k+8 1473 1474 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1475 fmov v0.d[1], x9 //CTR block 4k+8 1476 rev w9, w12 //CTR block 4k+9 1477 1478 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1479 orr x9, x11, x9, lsl #32 //CTR block 4k+9 1480 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid 
alignment 1481 1482 aese v3.16b, v26.16b 1483 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1484 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1485 #ifdef __AARCH64EB__ 1486 rev x7, x7 1487 #endif 1488 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1489 mov x20, v1.d[1] //AES block 4k+5 - mov high 1490 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1491 #ifdef __AARCH64EB__ 1492 rev x6, x6 1493 #endif 1494 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 1495 mov x19, v1.d[0] //AES block 4k+5 - mov low 1496 add w12, w12, #1 //CTR block 4k+9 1497 1498 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1499 fmov d1, x10 //CTR block 4k+9 1500 cmp x0, x5 //.LOOP CONTROL 1501 1502 rev64 v4.16b, v4.16b //GHASH block 4k+4 1503 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1504 fmov v1.d[1], x9 //CTR block 4k+9 1505 1506 rev w9, w12 //CTR block 4k+10 1507 add w12, w12, #1 //CTR block 4k+10 1508 1509 eor x20, x20, x14 //AES block 4k+5 - round 10 high 1510 #ifdef __AARCH64EB__ 1511 rev x20, x20 1512 #endif 1513 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 1514 1515 eor x19, x19, x13 //AES block 4k+5 - round 10 low 1516 #ifdef __AARCH64EB__ 1517 rev x19, x19 1518 #endif 1519 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 1520 1521 orr x9, x11, x9, lsl #32 //CTR block 4k+10 1522 b.lt .L128_dec_main_loop 1523 1524 .L128_dec_prepretail: //PREPRETAIL 1525 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1526 mov x21, v2.d[0] //AES block 4k+2 - mov low 1527 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1528 1529 aese v0.16b, v18.16b 1530 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1531 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1532 1533 aese v1.16b, v18.16b 1534 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1535 mov x22, v2.d[1] //AES block 4k+2 - mov high 1536 1537 eor v4.16b, v4.16b, v11.16b //PRE 1 1538 fmov d2, x10 //CTR block 4k+6 1539 rev64 v6.16b, v6.16b //GHASH block 4k+2 1540 1541 aese v0.16b, v19.16b 1542 aesmc v0.16b, 
v0.16b //AES block 4k+4 - round 1 1543 fmov v2.d[1], x9 //CTR block 4k+6 1544 1545 rev w9, w12 //CTR block 4k+7 1546 mov x23, v3.d[0] //AES block 4k+3 - mov low 1547 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1548 1549 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1550 mov d10, v17.d[1] //GHASH block 4k - mid 1551 mov x24, v3.d[1] //AES block 4k+3 - mov high 1552 1553 aese v1.16b, v19.16b 1554 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1555 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1556 1557 aese v0.16b, v20.16b 1558 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1559 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1560 1561 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1562 mov d8, v4.d[1] //GHASH block 4k - mid 1563 fmov d3, x10 //CTR block 4k+7 1564 1565 aese v2.16b, v18.16b 1566 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1567 fmov v3.d[1], x9 //CTR block 4k+7 1568 1569 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1570 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1571 1572 rev64 v7.16b, v7.16b //GHASH block 4k+3 1573 1574 aese v2.16b, v19.16b 1575 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1576 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1577 1578 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1579 1580 aese v3.16b, v18.16b 1581 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1582 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1583 1584 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1585 1586 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1587 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1588 1589 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1590 1591 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1592 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1593 1594 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1595 1596 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1597 1598 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1599 mov d30, v7.d[1] 
//GHASH block 4k+3 - mid 1600 1601 aese v1.16b, v20.16b 1602 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1603 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1604 1605 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1606 1607 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1608 movi v8.8b, #0xc2 1609 1610 aese v3.16b, v19.16b 1611 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1612 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1613 1614 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1615 1616 aese v2.16b, v20.16b 1617 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1618 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1619 1620 aese v3.16b, v20.16b 1621 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1622 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1623 #ifdef __AARCH64EB__ 1624 rev x23, x23 1625 #endif 1626 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1627 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1628 #ifdef __AARCH64EB__ 1629 rev x21, x21 1630 #endif 1631 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1632 1633 aese v2.16b, v21.16b 1634 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1635 1636 aese v1.16b, v21.16b 1637 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1638 shl d8, d8, #56 //mod_constant 1639 1640 aese v0.16b, v21.16b 1641 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1642 1643 aese v2.16b, v22.16b 1644 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1645 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1646 1647 aese v1.16b, v22.16b 1648 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1649 1650 aese v3.16b, v21.16b 1651 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1652 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1653 1654 aese v2.16b, v23.16b 1655 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1656 1657 aese v1.16b, v23.16b 1658 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1659 1660 aese v3.16b, v22.16b 1661 aesmc v3.16b, v3.16b //AES block 4k+7 - 
round 4 1662 1663 aese v0.16b, v22.16b 1664 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1665 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1666 1667 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1668 1669 aese v1.16b, v24.16b 1670 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1671 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1672 1673 aese v3.16b, v23.16b 1674 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1675 1676 aese v0.16b, v23.16b 1677 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1678 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1679 1680 aese v1.16b, v25.16b 1681 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1682 1683 aese v2.16b, v24.16b 1684 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1685 1686 aese v0.16b, v24.16b 1687 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1688 1689 aese v1.16b, v26.16b 1690 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1691 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1692 1693 aese v3.16b, v24.16b 1694 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1695 1696 aese v0.16b, v25.16b 1697 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1698 1699 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1700 1701 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1702 eor x24, x24, x14 //AES block 4k+3 - round 10 high 1703 #ifdef __AARCH64EB__ 1704 rev x24, x24 1705 #endif 1706 aese v2.16b, v25.16b 1707 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1708 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1709 1710 aese v3.16b, v25.16b 1711 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1712 1713 aese v0.16b, v26.16b 1714 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1715 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1716 1717 aese v2.16b, v26.16b 1718 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1719 1720 aese v3.16b, v26.16b 1721 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1722 eor x22, x22, x14 //AES block 4k+2 - 
round 10 high 1723 #ifdef __AARCH64EB__ 1724 rev x22, x22 1725 #endif 1726 aese v0.16b, v27.16b //AES block 4k+4 - round 9 1727 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1728 1729 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1730 add w12, w12, #1 //CTR block 4k+7 1731 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1732 1733 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1734 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1735 .L128_dec_tail: //TAIL 1736 1737 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 1738 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1739 1740 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 1741 1742 mov x7, v0.d[1] //AES block 4k+4 - mov high 1743 1744 mov x6, v0.d[0] //AES block 4k+4 - mov low 1745 1746 cmp x5, #48 1747 1748 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1749 #ifdef __AARCH64EB__ 1750 rev x7, x7 1751 #endif 1752 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 1753 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1754 #ifdef __AARCH64EB__ 1755 rev x6, x6 1756 #endif 1757 b.gt .L128_dec_blocks_more_than_3 1758 1759 mov v3.16b, v2.16b 1760 sub w12, w12, #1 1761 movi v11.8b, #0 1762 1763 movi v9.8b, #0 1764 mov v2.16b, v1.16b 1765 1766 movi v10.8b, #0 1767 cmp x5, #32 1768 b.gt .L128_dec_blocks_more_than_2 1769 1770 cmp x5, #16 1771 1772 mov v3.16b, v1.16b 1773 sub w12, w12, #1 1774 b.gt .L128_dec_blocks_more_than_1 1775 1776 sub w12, w12, #1 1777 b .L128_dec_blocks_less_than_1 1778 .L128_dec_blocks_more_than_3: //blocks left > 3 1779 rev64 v4.16b, v5.16b //GHASH final-3 block 1780 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 1781 1782 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1783 1784 mov d10, v17.d[1] //GHASH final-3 block - mid 1785 stp x6, x7, [x2], #16 //AES final-3 block - store result 1786 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 1787 1788 mov d22, v4.d[1] //GHASH final-3 block - mid 1789 mov 
x7, v0.d[1] //AES final-2 block - mov high 1790 1791 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 1792 mov x6, v0.d[0] //AES final-2 block - mov low 1793 1794 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 1795 1796 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 1797 1798 movi v8.8b, #0 //suppress further partial tag feed in 1799 eor x7, x7, x14 //AES final-2 block - round 10 high 1800 #ifdef __AARCH64EB__ 1801 rev x7, x7 1802 #endif 1803 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 1804 eor x6, x6, x13 //AES final-2 block - round 10 low 1805 #ifdef __AARCH64EB__ 1806 rev x6, x6 1807 #endif 1808 .L128_dec_blocks_more_than_2: //blocks left > 2 1809 1810 rev64 v4.16b, v5.16b //GHASH final-2 block 1811 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 1812 1813 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1814 1815 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 1816 stp x6, x7, [x2], #16 //AES final-2 block - store result 1817 1818 mov d22, v4.d[1] //GHASH final-2 block - mid 1819 1820 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 1821 1822 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 1823 mov x6, v0.d[0] //AES final-1 block - mov low 1824 1825 mov x7, v0.d[1] //AES final-1 block - mov high 1826 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 1827 1828 movi v8.8b, #0 //suppress further partial tag feed in 1829 1830 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 1831 1832 eor x6, x6, x13 //AES final-1 block - round 10 low 1833 #ifdef __AARCH64EB__ 1834 rev x6, x6 1835 #endif 1836 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 1837 1838 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 1839 1840 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 1841 eor x7, x7, x14 //AES final-1 block - round 10 high 1842 #ifdef __AARCH64EB__ 1843 rev x7, x7 1844 #endif 1845 .L128_dec_blocks_more_than_1: //blocks left > 1 1846 1847 rev64 v4.16b, v5.16b 
//GHASH final-1 block 1848 1849 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 1850 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1851 1852 mov d22, v4.d[1] //GHASH final-1 block - mid 1853 1854 eor v0.16b, v5.16b, v3.16b //AES final block - result 1855 1856 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 1857 1858 stp x6, x7, [x2], #16 //AES final-1 block - store result 1859 mov x6, v0.d[0] //AES final block - mov low 1860 1861 mov x7, v0.d[1] //AES final block - mov high 1862 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 1863 1864 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 1865 1866 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 1867 1868 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 1869 movi v8.8b, #0 //suppress further partial tag feed in 1870 1871 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 1872 1873 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 1874 eor x7, x7, x14 //AES final block - round 10 high 1875 #ifdef __AARCH64EB__ 1876 rev x7, x7 1877 #endif 1878 eor x6, x6, x13 //AES final block - round 10 low 1879 #ifdef __AARCH64EB__ 1880 rev x6, x6 1881 #endif 1882 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 1883 .L128_dec_blocks_less_than_1: //blocks left <= 1 1884 1885 mvn x14, xzr //rk10_h = 0xffffffffffffffff 1886 and x1, x1, #127 //bit_length %= 128 1887 1888 mvn x13, xzr //rk10_l = 0xffffffffffffffff 1889 sub x1, x1, #128 //bit_length -= 128 1890 1891 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 1892 1893 and x1, x1, #127 //bit_length %= 128 1894 1895 lsr x14, x14, x1 //rk10_h is mask for top 64b of last block 1896 cmp x1, #64 1897 1898 csel x10, x14, xzr, lt 1899 csel x9, x13, x14, lt 1900 1901 fmov d0, x9 //ctr0b is mask for last block 1902 1903 mov v0.d[1], x10 1904 1905 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 1906 1907 rev64 v4.16b, v5.16b //GHASH final block 1908 1909 eor v4.16b, 
v4.16b, v8.16b //feed in partial tag 1910 1911 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 1912 1913 and x7, x7, x10 1914 1915 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 1916 mov d8, v4.d[1] //GHASH final block - mid 1917 1918 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 1919 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 1920 1921 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 1922 1923 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 1924 bic x4, x4, x9 //mask out low existing bytes 1925 and x6, x6, x9 1926 1927 #ifndef __AARCH64EB__ 1928 rev w9, w12 1929 #else 1930 mov w9, w12 1931 #endif 1932 1933 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 1934 movi v8.8b, #0xc2 1935 1936 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 1937 1938 bic x5, x5, x10 //mask out high existing bytes 1939 shl d8, d8, #56 //mod_constant 1940 1941 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1942 1943 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1944 1945 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1946 1947 orr x6, x6, x4 1948 str w9, [x16, #12] //store the updated counter 1949 1950 orr x7, x7, x5 1951 stp x6, x7, [x2] 1952 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1953 1954 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1955 1956 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1957 1958 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1959 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1960 1961 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1962 1963 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1964 ext v11.16b, v11.16b, v11.16b, #8 1965 rev64 v11.16b, v11.16b 1966 mov x0, x15 1967 st1 { v11.16b }, [x3] 1968 1969 ldp x21, x22, [sp, #16] 1970 ldp x23, x24, [sp, #32] 1971 ldp d8, d9, [sp, #48] 1972 ldp d10, d11, [sp, #64] 1973 ldp d12, d13, [sp, #80] 1974 ldp d14, d15, [sp, #96] 1975 ldp 
x19, x20, [sp], #112 1976 ret 1977 1978 .L128_dec_ret: 1979 mov w0, #0x0 1980 ret 1981 .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 1982 .globl aes_gcm_enc_192_kernel 1983 .type aes_gcm_enc_192_kernel,%function 1984 .align 4 1985 aes_gcm_enc_192_kernel: 1986 AARCH64_VALID_CALL_TARGET 1987 cbz x1, .L192_enc_ret 1988 stp x19, x20, [sp, #-112]! 1989 mov x16, x4 1990 mov x8, x5 1991 stp x21, x22, [sp, #16] 1992 stp x23, x24, [sp, #32] 1993 stp d8, d9, [sp, #48] 1994 stp d10, d11, [sp, #64] 1995 stp d12, d13, [sp, #80] 1996 stp d14, d15, [sp, #96] 1997 1998 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 1999 #ifdef __AARCH64EB__ 2000 rev x10, x10 2001 rev x11, x11 2002 #endif 2003 ldp x13, x14, [x8, #192] //load rk12 2004 #ifdef __AARCH64EB__ 2005 ror x13, x13, #32 2006 ror x14, x14, #32 2007 #endif 2008 ld1 {v18.4s}, [x8], #16 //load rk0 2009 2010 ld1 {v19.4s}, [x8], #16 //load rk1 2011 2012 ld1 {v20.4s}, [x8], #16 //load rk2 2013 2014 lsr x12, x11, #32 2015 ld1 {v21.4s}, [x8], #16 //load rk3 2016 orr w11, w11, w11 2017 2018 ld1 {v22.4s}, [x8], #16 //load rk4 2019 rev w12, w12 //rev_ctr32 2020 2021 add w12, w12, #1 //increment rev_ctr32 2022 fmov d3, x10 //CTR block 3 2023 2024 rev w9, w12 //CTR block 1 2025 add w12, w12, #1 //CTR block 1 2026 fmov d1, x10 //CTR block 1 2027 2028 orr x9, x11, x9, lsl #32 //CTR block 1 2029 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 2030 2031 fmov v1.d[1], x9 //CTR block 1 2032 rev w9, w12 //CTR block 2 2033 add w12, w12, #1 //CTR block 2 2034 2035 fmov d2, x10 //CTR block 2 2036 orr x9, x11, x9, lsl #32 //CTR block 2 2037 2038 fmov v2.d[1], x9 //CTR block 2 2039 rev w9, w12 //CTR block 3 2040 2041 orr x9, x11, x9, lsl #32 //CTR block 3 2042 ld1 {v23.4s}, [x8], #16 //load rk5 2043 2044 fmov v3.d[1], x9 //CTR block 3 2045 2046 ld1 {v24.4s}, [x8], #16 //load rk6 2047 2048 ld1 {v25.4s}, [x8], #16 //load rk7 2049 2050 aese v0.16b, v18.16b 2051 aesmc v0.16b, 
v0.16b //AES block 0 - round 0 2052 ld1 { v11.16b}, [x3] 2053 ext v11.16b, v11.16b, v11.16b, #8 2054 rev64 v11.16b, v11.16b 2055 2056 aese v3.16b, v18.16b 2057 aesmc v3.16b, v3.16b //AES block 3 - round 0 2058 ld1 {v26.4s}, [x8], #16 //load rk8 2059 2060 aese v1.16b, v18.16b 2061 aesmc v1.16b, v1.16b //AES block 1 - round 0 2062 ldr q15, [x3, #112] //load h4l | h4h 2063 #ifndef __AARCH64EB__ 2064 ext v15.16b, v15.16b, v15.16b, #8 2065 #endif 2066 aese v2.16b, v18.16b 2067 aesmc v2.16b, v2.16b //AES block 2 - round 0 2068 ld1 {v27.4s}, [x8], #16 //load rk9 2069 2070 aese v0.16b, v19.16b 2071 aesmc v0.16b, v0.16b //AES block 0 - round 1 2072 ld1 {v28.4s}, [x8], #16 //load rk10 2073 2074 aese v1.16b, v19.16b 2075 aesmc v1.16b, v1.16b //AES block 1 - round 1 2076 ldr q12, [x3, #32] //load h1l | h1h 2077 #ifndef __AARCH64EB__ 2078 ext v12.16b, v12.16b, v12.16b, #8 2079 #endif 2080 aese v2.16b, v19.16b 2081 aesmc v2.16b, v2.16b //AES block 2 - round 1 2082 ld1 {v29.4s}, [x8], #16 //load rk11 2083 2084 aese v3.16b, v19.16b 2085 aesmc v3.16b, v3.16b //AES block 3 - round 1 2086 ldr q14, [x3, #80] //load h3l | h3h 2087 #ifndef __AARCH64EB__ 2088 ext v14.16b, v14.16b, v14.16b, #8 2089 #endif 2090 aese v0.16b, v20.16b 2091 aesmc v0.16b, v0.16b //AES block 0 - round 2 2092 2093 aese v2.16b, v20.16b 2094 aesmc v2.16b, v2.16b //AES block 2 - round 2 2095 2096 aese v3.16b, v20.16b 2097 aesmc v3.16b, v3.16b //AES block 3 - round 2 2098 2099 aese v0.16b, v21.16b 2100 aesmc v0.16b, v0.16b //AES block 0 - round 3 2101 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 2102 2103 aese v2.16b, v21.16b 2104 aesmc v2.16b, v2.16b //AES block 2 - round 3 2105 2106 aese v1.16b, v20.16b 2107 aesmc v1.16b, v1.16b //AES block 1 - round 2 2108 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 2109 2110 aese v0.16b, v22.16b 2111 aesmc v0.16b, v0.16b //AES block 0 - round 4 2112 2113 aese v3.16b, v21.16b 2114 aesmc v3.16b, v3.16b //AES block 3 - round 3 2115 2116 aese v1.16b, v21.16b 2117 aesmc v1.16b, v1.16b //AES 
block 1 - round 3 2118 2119 aese v0.16b, v23.16b 2120 aesmc v0.16b, v0.16b //AES block 0 - round 5 2121 2122 aese v2.16b, v22.16b 2123 aesmc v2.16b, v2.16b //AES block 2 - round 4 2124 2125 aese v1.16b, v22.16b 2126 aesmc v1.16b, v1.16b //AES block 1 - round 4 2127 2128 aese v0.16b, v24.16b 2129 aesmc v0.16b, v0.16b //AES block 0 - round 6 2130 2131 aese v3.16b, v22.16b 2132 aesmc v3.16b, v3.16b //AES block 3 - round 4 2133 2134 aese v2.16b, v23.16b 2135 aesmc v2.16b, v2.16b //AES block 2 - round 5 2136 2137 aese v1.16b, v23.16b 2138 aesmc v1.16b, v1.16b //AES block 1 - round 5 2139 2140 aese v3.16b, v23.16b 2141 aesmc v3.16b, v3.16b //AES block 3 - round 5 2142 2143 aese v2.16b, v24.16b 2144 aesmc v2.16b, v2.16b //AES block 2 - round 6 2145 ldr q13, [x3, #64] //load h2l | h2h 2146 #ifndef __AARCH64EB__ 2147 ext v13.16b, v13.16b, v13.16b, #8 2148 #endif 2149 aese v1.16b, v24.16b 2150 aesmc v1.16b, v1.16b //AES block 1 - round 6 2151 2152 aese v3.16b, v24.16b 2153 aesmc v3.16b, v3.16b //AES block 3 - round 6 2154 2155 aese v0.16b, v25.16b 2156 aesmc v0.16b, v0.16b //AES block 0 - round 7 2157 2158 aese v1.16b, v25.16b 2159 aesmc v1.16b, v1.16b //AES block 1 - round 7 2160 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 2161 2162 aese v3.16b, v25.16b 2163 aesmc v3.16b, v3.16b //AES block 3 - round 7 2164 2165 aese v0.16b, v26.16b 2166 aesmc v0.16b, v0.16b //AES block 0 - round 8 2167 2168 aese v2.16b, v25.16b 2169 aesmc v2.16b, v2.16b //AES block 2 - round 7 2170 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 2171 2172 aese v1.16b, v26.16b 2173 aesmc v1.16b, v1.16b //AES block 1 - round 8 2174 2175 aese v3.16b, v26.16b 2176 aesmc v3.16b, v3.16b //AES block 3 - round 8 2177 2178 aese v2.16b, v26.16b 2179 aesmc v2.16b, v2.16b //AES block 2 - round 8 2180 2181 aese v0.16b, v27.16b 2182 aesmc v0.16b, v0.16b //AES block 0 - round 9 2183 2184 aese v3.16b, v27.16b 2185 aesmc v3.16b, v3.16b //AES block 3 - round 9 2186 2187 aese v2.16b, v27.16b 2188 aesmc v2.16b, v2.16b //AES block 2 - 
round 9 2189 2190 aese v1.16b, v27.16b 2191 aesmc v1.16b, v1.16b //AES block 1 - round 9 2192 2193 aese v0.16b, v28.16b 2194 aesmc v0.16b, v0.16b //AES block 0 - round 10 2195 2196 aese v2.16b, v28.16b 2197 aesmc v2.16b, v2.16b //AES block 2 - round 10 2198 2199 aese v1.16b, v28.16b 2200 aesmc v1.16b, v1.16b //AES block 1 - round 10 2201 lsr x5, x1, #3 //byte_len 2202 mov x15, x5 2203 2204 aese v3.16b, v28.16b 2205 aesmc v3.16b, v3.16b //AES block 3 - round 10 2206 sub x5, x5, #1 //byte_len - 1 2207 2208 eor v16.16b, v16.16b, v8.16b //h2k | h1k 2209 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2210 2211 eor v17.16b, v17.16b, v9.16b //h4k | h3k 2212 2213 aese v2.16b, v29.16b //AES block 2 - round 11 2214 add x4, x0, x1, lsr #3 //end_input_ptr 2215 add x5, x5, x0 2216 2217 aese v1.16b, v29.16b //AES block 1 - round 11 2218 cmp x0, x5 //check if we have <= 4 blocks 2219 2220 aese v0.16b, v29.16b //AES block 0 - round 11 2221 add w12, w12, #1 //CTR block 3 2222 2223 aese v3.16b, v29.16b //AES block 3 - round 11 2224 b.ge .L192_enc_tail //handle tail 2225 2226 rev w9, w12 //CTR block 4 2227 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 2228 #ifdef __AARCH64EB__ 2229 rev x6, x6 2230 rev x7, x7 2231 #endif 2232 orr x9, x11, x9, lsl #32 //CTR block 4 2233 ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 2234 #ifdef __AARCH64EB__ 2235 rev x21, x21 2236 rev x22, x22 2237 #endif 2238 ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 2239 #ifdef __AARCH64EB__ 2240 rev x23, x23 2241 rev x24, x24 2242 #endif 2243 ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 2244 #ifdef __AARCH64EB__ 2245 rev x19, x19 2246 rev x20, x20 2247 #endif 2248 add x0, x0, #64 //AES input_ptr update 2249 cmp x0, x5 //check if we have <= 8 blocks 2250 2251 eor x6, x6, x13 //AES block 0 - round 12 low 2252 2253 eor x7, x7, x14 //AES block 0 - round 12 high 2254 eor x22, x22, x14 //AES block 2 - round 12 
high 2255 fmov d4, x6 //AES block 0 - mov low 2256 2257 eor x24, x24, x14 //AES block 3 - round 12 high 2258 fmov v4.d[1], x7 //AES block 0 - mov high 2259 2260 eor x21, x21, x13 //AES block 2 - round 12 low 2261 eor x19, x19, x13 //AES block 1 - round 12 low 2262 2263 fmov d5, x19 //AES block 1 - mov low 2264 eor x20, x20, x14 //AES block 1 - round 12 high 2265 2266 fmov v5.d[1], x20 //AES block 1 - mov high 2267 2268 eor x23, x23, x13 //AES block 3 - round 12 low 2269 fmov d6, x21 //AES block 2 - mov low 2270 2271 add w12, w12, #1 //CTR block 4 2272 eor v4.16b, v4.16b, v0.16b //AES block 0 - result 2273 fmov d0, x10 //CTR block 4 2274 2275 fmov v0.d[1], x9 //CTR block 4 2276 rev w9, w12 //CTR block 5 2277 2278 orr x9, x11, x9, lsl #32 //CTR block 5 2279 add w12, w12, #1 //CTR block 5 2280 2281 fmov d7, x23 //AES block 3 - mov low 2282 st1 { v4.16b}, [x2], #16 //AES block 0 - store result 2283 2284 fmov v6.d[1], x22 //AES block 2 - mov high 2285 2286 eor v5.16b, v5.16b, v1.16b //AES block 1 - result 2287 fmov d1, x10 //CTR block 5 2288 st1 { v5.16b}, [x2], #16 //AES block 1 - store result 2289 2290 fmov v7.d[1], x24 //AES block 3 - mov high 2291 2292 fmov v1.d[1], x9 //CTR block 5 2293 rev w9, w12 //CTR block 6 2294 2295 orr x9, x11, x9, lsl #32 //CTR block 6 2296 2297 add w12, w12, #1 //CTR block 6 2298 eor v6.16b, v6.16b, v2.16b //AES block 2 - result 2299 fmov d2, x10 //CTR block 6 2300 2301 fmov v2.d[1], x9 //CTR block 6 2302 rev w9, w12 //CTR block 7 2303 2304 orr x9, x11, x9, lsl #32 //CTR block 7 2305 st1 { v6.16b}, [x2], #16 //AES block 2 - store result 2306 2307 eor v7.16b, v7.16b, v3.16b //AES block 3 - result 2308 st1 { v7.16b}, [x2], #16 //AES block 3 - store result 2309 b.ge .L192_enc_prepretail //do prepretail 2310 2311 .L192_enc_main_loop: //main loop start 2312 aese v2.16b, v18.16b 2313 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2314 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2315 2316 aese v1.16b, v18.16b 2317 aesmc v1.16b, 
v1.16b //AES block 4k+5 - round 0 2318 ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext 2319 #ifdef __AARCH64EB__ 2320 rev x19, x19 2321 rev x20, x20 2322 #endif 2323 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2324 fmov d3, x10 //CTR block 4k+3 2325 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2326 2327 aese v2.16b, v19.16b 2328 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2329 fmov v3.d[1], x9 //CTR block 4k+3 2330 2331 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2332 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2333 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 2334 #ifdef __AARCH64EB__ 2335 rev x21, x21 2336 rev x22, x22 2337 #endif 2338 aese v0.16b, v18.16b 2339 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2340 ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext 2341 #ifdef __AARCH64EB__ 2342 rev x23, x23 2343 rev x24, x24 2344 #endif 2345 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2346 eor v4.16b, v4.16b, v11.16b //PRE 1 2347 2348 aese v1.16b, v19.16b 2349 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 2350 2351 aese v0.16b, v19.16b 2352 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2353 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2354 2355 aese v3.16b, v18.16b 2356 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2357 eor x24, x24, x14 //AES block 4k+3 - round 12 high 2358 2359 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2360 mov d8, v4.d[1] //GHASH block 4k - mid 2361 2362 aese v0.16b, v20.16b 2363 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2364 2365 aese v3.16b, v19.16b 2366 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2367 eor x21, x21, x13 //AES block 4k+6 - round 12 low 2368 2369 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2370 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2371 2372 aese v0.16b, v21.16b 2373 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2374 eor x19, x19, x13 //AES block 4k+5 - round 12 low 2375 2376 aese 
v1.16b, v20.16b 2377 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2378 mov d31, v6.d[1] //GHASH block 4k+2 - mid 2379 2380 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2381 mov d4, v5.d[1] //GHASH block 4k+1 - mid 2382 2383 aese v2.16b, v20.16b 2384 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2385 2386 aese v1.16b, v21.16b 2387 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2388 2389 mov d10, v17.d[1] //GHASH block 4k - mid 2390 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2391 2392 aese v3.16b, v20.16b 2393 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2394 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2395 2396 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2397 2398 aese v0.16b, v22.16b 2399 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2400 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2401 2402 aese v3.16b, v21.16b 2403 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2404 2405 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2406 eor x20, x20, x14 //AES block 4k+5 - round 12 high 2407 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2408 2409 aese v0.16b, v23.16b 2410 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2411 add w12, w12, #1 //CTR block 4k+3 2412 2413 aese v3.16b, v22.16b 2414 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2415 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2416 2417 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2418 eor x22, x22, x14 //AES block 4k+6 - round 12 high 2419 2420 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2421 eor x23, x23, x13 //AES block 4k+3 - round 12 low 2422 mov d30, v7.d[1] //GHASH block 4k+3 - mid 2423 2424 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2425 rev w9, w12 //CTR block 4k+8 2426 2427 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2428 orr x9, x11, x9, lsl #32 //CTR block 4k+8 2429 2430 aese v2.16b, v21.16b 2431 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2432 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 
2433 2434 aese v1.16b, v22.16b 2435 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2436 ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 2437 #ifdef __AARCH64EB__ 2438 rev x6, x6 2439 rev x7, x7 2440 #endif 2441 aese v0.16b, v24.16b 2442 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 2443 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2444 2445 aese v2.16b, v22.16b 2446 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2447 add x0, x0, #64 //AES input_ptr update 2448 2449 aese v1.16b, v23.16b 2450 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2451 movi v8.8b, #0xc2 2452 2453 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2454 eor x7, x7, x14 //AES block 4k+4 - round 12 high 2455 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2456 2457 aese v2.16b, v23.16b 2458 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2459 eor x6, x6, x13 //AES block 4k+4 - round 12 low 2460 2461 aese v1.16b, v24.16b 2462 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2463 shl d8, d8, #56 //mod_constant 2464 2465 aese v3.16b, v23.16b 2466 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2467 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2468 2469 aese v0.16b, v25.16b 2470 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2471 fmov d5, x19 //AES block 4k+5 - mov low 2472 2473 aese v1.16b, v25.16b 2474 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2475 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2476 2477 aese v3.16b, v24.16b 2478 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2479 fmov v5.d[1], x20 //AES block 4k+5 - mov high 2480 2481 aese v0.16b, v26.16b 2482 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2483 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2484 2485 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 2486 cmp x0, x5 //.LOOP CONTROL 2487 fmov d4, x6 //AES block 4k+4 - mov low 2488 2489 aese v2.16b, v24.16b 2490 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2491 fmov v4.d[1], x7 //AES block 4k+4 - mov high 2492 2493 
aese v1.16b, v26.16b 2494 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2495 fmov d7, x23 //AES block 4k+3 - mov low 2496 2497 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2498 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2499 add w12, w12, #1 //CTR block 4k+8 2500 2501 aese v2.16b, v25.16b 2502 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2503 fmov v7.d[1], x24 //AES block 4k+3 - mov high 2504 2505 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 2506 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 2507 fmov d6, x21 //AES block 4k+6 - mov low 2508 2509 aese v3.16b, v25.16b 2510 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2511 2512 aese v0.16b, v27.16b 2513 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2514 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 2515 2516 aese v2.16b, v26.16b 2517 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2518 2519 aese v3.16b, v26.16b 2520 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2521 2522 aese v1.16b, v27.16b 2523 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2524 2525 aese v0.16b, v28.16b 2526 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2527 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 2528 2529 aese v3.16b, v27.16b 2530 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2531 2532 aese v2.16b, v27.16b 2533 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2534 2535 aese v0.16b, v29.16b //AES block 4k+4 - round 11 2536 2537 aese v1.16b, v28.16b 2538 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2539 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 2540 2541 aese v2.16b, v28.16b 2542 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2543 2544 eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 2545 fmov d0, x10 //CTR block 4k+8 2546 2547 aese v1.16b, v29.16b //AES block 4k+5 - round 11 2548 fmov v0.d[1], x9 //CTR block 4k+8 2549 rev w9, w12 //CTR block 4k+9 2550 2551 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 2552 fmov 
v6.d[1], x22 //AES block 4k+6 - mov high 2553 st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result 2554 2555 aese v3.16b, v28.16b 2556 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2557 orr x9, x11, x9, lsl #32 //CTR block 4k+9 2558 2559 eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 2560 add w12, w12, #1 //CTR block 4k+9 2561 fmov d1, x10 //CTR block 4k+9 2562 2563 aese v2.16b, v29.16b //AES block 4k+6 - round 11 2564 fmov v1.d[1], x9 //CTR block 4k+9 2565 rev w9, w12 //CTR block 4k+10 2566 2567 add w12, w12, #1 //CTR block 4k+10 2568 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 2569 orr x9, x11, x9, lsl #32 //CTR block 4k+10 2570 2571 st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 2572 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 2573 2574 aese v3.16b, v29.16b //AES block 4k+7 - round 11 2575 eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 2576 fmov d2, x10 //CTR block 4k+10 2577 2578 st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result 2579 fmov v2.d[1], x9 //CTR block 4k+10 2580 rev w9, w12 //CTR block 4k+11 2581 2582 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 2583 orr x9, x11, x9, lsl #32 //CTR block 4k+11 2584 2585 eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result 2586 st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result 2587 b.lt .L192_enc_main_loop 2588 2589 .L192_enc_prepretail: //PREPRETAIL 2590 aese v0.16b, v18.16b 2591 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2592 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2593 2594 fmov d3, x10 //CTR block 4k+3 2595 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2596 add w12, w12, #1 //CTR block 4k+3 2597 2598 aese v1.16b, v18.16b 2599 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 2600 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2601 2602 aese v2.16b, v18.16b 2603 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2604 2605 fmov v3.d[1], x9 //CTR block 4k+3 2606 eor v4.16b, v4.16b, v11.16b //PRE 1 2607 mov d10, 
v17.d[1] //GHASH block 4k - mid 2608 2609 aese v1.16b, v19.16b 2610 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 2611 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2612 2613 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2614 2615 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2616 mov d8, v4.d[1] //GHASH block 4k - mid 2617 2618 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2619 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2620 2621 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2622 2623 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2624 mov d4, v5.d[1] //GHASH block 4k+1 - mid 2625 2626 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2627 mov d31, v6.d[1] //GHASH block 4k+2 - mid 2628 2629 aese v3.16b, v18.16b 2630 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2631 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2632 2633 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2634 2635 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2636 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2637 2638 aese v3.16b, v19.16b 2639 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2640 2641 aese v2.16b, v19.16b 2642 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2643 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2644 2645 aese v0.16b, v19.16b 2646 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2647 2648 aese v1.16b, v20.16b 2649 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2650 mov d30, v7.d[1] //GHASH block 4k+3 - mid 2651 2652 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2653 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2654 2655 aese v0.16b, v20.16b 2656 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2657 2658 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2659 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 2660 2661 aese v1.16b, v21.16b 2662 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2663 2664 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2665 
2666 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2667 2668 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 2669 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2670 2671 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2672 2673 aese v0.16b, v21.16b 2674 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2675 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2676 2677 aese v3.16b, v20.16b 2678 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2679 2680 aese v2.16b, v20.16b 2681 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2682 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2683 2684 aese v0.16b, v22.16b 2685 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2686 2687 aese v3.16b, v21.16b 2688 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2689 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2690 2691 aese v2.16b, v21.16b 2692 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2693 2694 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2695 movi v8.8b, #0xc2 2696 2697 aese v3.16b, v22.16b 2698 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2699 2700 aese v2.16b, v22.16b 2701 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2702 2703 aese v1.16b, v22.16b 2704 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2705 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2706 2707 aese v3.16b, v23.16b 2708 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2709 2710 aese v2.16b, v23.16b 2711 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2712 2713 aese v1.16b, v23.16b 2714 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2715 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2716 2717 aese v0.16b, v23.16b 2718 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2719 2720 aese v3.16b, v24.16b 2721 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2722 eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 2723 2724 aese v1.16b, v24.16b 2725 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2726 2727 aese v0.16b, v24.16b 2728 aesmc v0.16b, v0.16b //AES 
block 4k+4 - round 6 2729 shl d8, d8, #56 //mod_constant 2730 2731 aese v3.16b, v25.16b 2732 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2733 2734 aese v1.16b, v25.16b 2735 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2736 eor v10.16b, v10.16b, v11.16b 2737 2738 aese v0.16b, v25.16b 2739 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2740 2741 pmull v30.1q, v9.1d, v8.1d 2742 2743 aese v2.16b, v24.16b 2744 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2745 ext v9.16b, v9.16b, v9.16b, #8 2746 2747 aese v0.16b, v26.16b 2748 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2749 2750 aese v1.16b, v26.16b 2751 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2752 eor v10.16b, v10.16b, v30.16b 2753 2754 aese v2.16b, v25.16b 2755 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2756 2757 aese v3.16b, v26.16b 2758 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2759 2760 aese v0.16b, v27.16b 2761 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2762 2763 aese v2.16b, v26.16b 2764 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2765 eor v10.16b, v10.16b, v9.16b 2766 2767 aese v3.16b, v27.16b 2768 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2769 2770 aese v1.16b, v27.16b 2771 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2772 2773 aese v2.16b, v27.16b 2774 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2775 2776 pmull v30.1q, v10.1d, v8.1d 2777 2778 ext v10.16b, v10.16b, v10.16b, #8 2779 2780 aese v3.16b, v28.16b 2781 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2782 2783 aese v0.16b, v28.16b 2784 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2785 2786 aese v2.16b, v28.16b 2787 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2788 2789 aese v1.16b, v28.16b 2790 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2791 eor v11.16b, v11.16b, v30.16b 2792 2793 aese v0.16b, v29.16b //AES block 4k+4 - round 11 2794 2795 aese v3.16b, v29.16b //AES block 4k+7 - round 11 2796 2797 aese v2.16b, v29.16b //AES block 4k+6 - round 11 2798 2799 aese v1.16b, v29.16b //AES 
block 4k+5 - round 11 2800 eor v11.16b, v11.16b, v10.16b 2801 .L192_enc_tail: //TAIL 2802 2803 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 2804 ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 2805 #ifdef __AARCH64EB__ 2806 rev x6, x6 2807 rev x7, x7 2808 #endif 2809 eor x6, x6, x13 //AES block 4k+4 - round 12 low 2810 eor x7, x7, x14 //AES block 4k+4 - round 12 high 2811 2812 fmov d4, x6 //AES block 4k+4 - mov low 2813 2814 fmov v4.d[1], x7 //AES block 4k+4 - mov high 2815 cmp x5, #48 2816 2817 eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 2818 2819 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 2820 b.gt .L192_enc_blocks_more_than_3 2821 2822 sub w12, w12, #1 2823 movi v10.8b, #0 2824 2825 mov v3.16b, v2.16b 2826 movi v9.8b, #0 2827 cmp x5, #32 2828 2829 mov v2.16b, v1.16b 2830 movi v11.8b, #0 2831 b.gt .L192_enc_blocks_more_than_2 2832 2833 sub w12, w12, #1 2834 2835 mov v3.16b, v1.16b 2836 cmp x5, #16 2837 b.gt .L192_enc_blocks_more_than_1 2838 2839 sub w12, w12, #1 2840 b .L192_enc_blocks_less_than_1 2841 .L192_enc_blocks_more_than_3: //blocks left > 3 2842 st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 2843 2844 ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 2845 #ifdef __AARCH64EB__ 2846 rev x6, x6 2847 rev x7, x7 2848 #endif 2849 rev64 v4.16b, v5.16b //GHASH final-3 block 2850 2851 eor x6, x6, x13 //AES final-2 block - round 12 low 2852 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2853 2854 eor x7, x7, x14 //AES final-2 block - round 12 high 2855 fmov d5, x6 //AES final-2 block - mov low 2856 2857 fmov v5.d[1], x7 //AES final-2 block - mov high 2858 2859 mov d22, v4.d[1] //GHASH final-3 block - mid 2860 2861 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 2862 2863 mov d10, v17.d[1] //GHASH final-3 block - mid 2864 2865 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 2866 2867 movi v8.8b, #0 //suppress further partial tag feed in 2868 2869 pmull2 v9.1q, 
v4.2d, v15.2d //GHASH final-3 block - high 2870 2871 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 2872 eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 2873 .L192_enc_blocks_more_than_2: //blocks left > 2 2874 2875 st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 2876 2877 rev64 v4.16b, v5.16b //GHASH final-2 block 2878 ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high 2879 #ifdef __AARCH64EB__ 2880 rev x6, x6 2881 rev x7, x7 2882 #endif 2883 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2884 2885 eor x7, x7, x14 //AES final-1 block - round 12 high 2886 2887 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 2888 mov d22, v4.d[1] //GHASH final-2 block - mid 2889 2890 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 2891 eor x6, x6, x13 //AES final-1 block - round 12 low 2892 2893 fmov d5, x6 //AES final-1 block - mov low 2894 2895 fmov v5.d[1], x7 //AES final-1 block - mov high 2896 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 2897 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 2898 2899 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 2900 2901 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 2902 2903 movi v8.8b, #0 //suppress further partial tag feed in 2904 2905 eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 2906 2907 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 2908 .L192_enc_blocks_more_than_1: //blocks left > 1 2909 2910 st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 2911 2912 ldp x6, x7, [x0], #16 //AES final block - load input low & high 2913 #ifdef __AARCH64EB__ 2914 rev x6, x6 2915 rev x7, x7 2916 #endif 2917 rev64 v4.16b, v5.16b //GHASH final-1 block 2918 2919 eor x6, x6, x13 //AES final block - round 12 low 2920 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2921 movi v8.8b, #0 //suppress further partial tag feed in 2922 2923 mov d22, v4.d[1] //GHASH final-1 block - mid 2924 2925 eor v22.8b, v22.8b, v4.8b //GHASH 
final-1 block - mid 2926 eor x7, x7, x14 //AES final block - round 12 high 2927 fmov d5, x6 //AES final block - mov low 2928 2929 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 2930 fmov v5.d[1], x7 //AES final block - mov high 2931 2932 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 2933 2934 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 2935 2936 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 2937 2938 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 2939 2940 eor v5.16b, v5.16b, v3.16b //AES final block - result 2941 2942 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 2943 2944 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 2945 .L192_enc_blocks_less_than_1: //blocks left <= 1 2946 2947 ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 2948 #ifndef __AARCH64EB__ 2949 rev w9, w12 2950 #else 2951 mov w9, w12 2952 #endif 2953 and x1, x1, #127 //bit_length %= 128 2954 2955 sub x1, x1, #128 //bit_length -= 128 2956 mvn x14, xzr //rk12_h = 0xffffffffffffffff 2957 2958 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 2959 mvn x13, xzr //rk12_l = 0xffffffffffffffff 2960 2961 and x1, x1, #127 //bit_length %= 128 2962 2963 lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 2964 cmp x1, #64 2965 2966 csel x6, x13, x14, lt 2967 csel x7, x14, xzr, lt 2968 2969 fmov d0, x6 //ctr0b is mask for last block 2970 2971 fmov v0.d[1], x7 2972 2973 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 2974 2975 rev64 v4.16b, v5.16b //GHASH final block 2976 2977 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2978 2979 mov d8, v4.d[1] //GHASH final block - mid 2980 2981 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 2982 2983 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 2984 2985 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 2986 2987 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 2988 
2989 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 2990 2991 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 2992 2993 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 2994 movi v8.8b, #0xc2 2995 2996 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2997 2998 shl d8, d8, #56 //mod_constant 2999 3000 bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 3001 3002 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3003 3004 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3005 3006 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3007 3008 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3009 3010 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3011 3012 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3013 3014 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3015 3016 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 3017 str w9, [x16, #12] //store the updated counter 3018 3019 st1 { v5.16b}, [x2] //store all 16B 3020 3021 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3022 ext v11.16b, v11.16b, v11.16b, #8 3023 rev64 v11.16b, v11.16b 3024 mov x0, x15 3025 st1 { v11.16b }, [x3] 3026 3027 ldp x21, x22, [sp, #16] 3028 ldp x23, x24, [sp, #32] 3029 ldp d8, d9, [sp, #48] 3030 ldp d10, d11, [sp, #64] 3031 ldp d12, d13, [sp, #80] 3032 ldp d14, d15, [sp, #96] 3033 ldp x19, x20, [sp], #112 3034 ret 3035 3036 .L192_enc_ret: 3037 mov w0, #0x0 3038 ret 3039 .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3040 .globl aes_gcm_dec_192_kernel 3041 .type aes_gcm_dec_192_kernel,%function 3042 .align 4 3043 aes_gcm_dec_192_kernel: 3044 AARCH64_VALID_CALL_TARGET 3045 cbz x1, .L192_dec_ret 3046 stp x19, x20, [sp, #-112]! 
3047 mov x16, x4 3048 mov x8, x5 3049 stp x21, x22, [sp, #16] 3050 stp x23, x24, [sp, #32] 3051 stp d8, d9, [sp, #48] 3052 stp d10, d11, [sp, #64] 3053 stp d12, d13, [sp, #80] 3054 stp d14, d15, [sp, #96] 3055 3056 add x4, x0, x1, lsr #3 //end_input_ptr 3057 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 3058 #ifdef __AARCH64EB__ 3059 rev x10, x10 3060 rev x11, x11 3061 #endif 3062 ldp x13, x14, [x8, #192] //load rk12 3063 #ifdef __AARCH64EB__ 3064 ror x13, x13, #32 3065 ror x14, x14, #32 3066 #endif 3067 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 3068 3069 ld1 {v18.4s}, [x8], #16 //load rk0 3070 3071 lsr x5, x1, #3 //byte_len 3072 mov x15, x5 3073 ld1 {v19.4s}, [x8], #16 //load rk1 3074 3075 lsr x12, x11, #32 3076 orr w11, w11, w11 3077 fmov d3, x10 //CTR block 3 3078 3079 rev w12, w12 //rev_ctr32 3080 fmov d1, x10 //CTR block 1 3081 3082 add w12, w12, #1 //increment rev_ctr32 3083 ld1 {v20.4s}, [x8], #16 //load rk2 3084 3085 aese v0.16b, v18.16b 3086 aesmc v0.16b, v0.16b //AES block 0 - round 0 3087 rev w9, w12 //CTR block 1 3088 3089 add w12, w12, #1 //CTR block 1 3090 orr x9, x11, x9, lsl #32 //CTR block 1 3091 ld1 {v21.4s}, [x8], #16 //load rk3 3092 3093 fmov v1.d[1], x9 //CTR block 1 3094 rev w9, w12 //CTR block 2 3095 add w12, w12, #1 //CTR block 2 3096 3097 fmov d2, x10 //CTR block 2 3098 orr x9, x11, x9, lsl #32 //CTR block 2 3099 3100 fmov v2.d[1], x9 //CTR block 2 3101 rev w9, w12 //CTR block 3 3102 3103 aese v0.16b, v19.16b 3104 aesmc v0.16b, v0.16b //AES block 0 - round 1 3105 orr x9, x11, x9, lsl #32 //CTR block 3 3106 3107 fmov v3.d[1], x9 //CTR block 3 3108 3109 ld1 {v22.4s}, [x8], #16 //load rk4 3110 3111 aese v0.16b, v20.16b 3112 aesmc v0.16b, v0.16b //AES block 0 - round 2 3113 3114 aese v2.16b, v18.16b 3115 aesmc v2.16b, v2.16b //AES block 2 - round 0 3116 ld1 {v23.4s}, [x8], #16 //load rk5 3117 3118 aese v1.16b, v18.16b 3119 aesmc v1.16b, v1.16b //AES block 1 - round 0 3120 
ldr q15, [x3, #112] //load h4l | h4h 3121 #ifndef __AARCH64EB__ 3122 ext v15.16b, v15.16b, v15.16b, #8 3123 #endif 3124 aese v3.16b, v18.16b 3125 aesmc v3.16b, v3.16b //AES block 3 - round 0 3126 ldr q13, [x3, #64] //load h2l | h2h 3127 #ifndef __AARCH64EB__ 3128 ext v13.16b, v13.16b, v13.16b, #8 3129 #endif 3130 aese v2.16b, v19.16b 3131 aesmc v2.16b, v2.16b //AES block 2 - round 1 3132 ldr q14, [x3, #80] //load h3l | h3h 3133 #ifndef __AARCH64EB__ 3134 ext v14.16b, v14.16b, v14.16b, #8 3135 #endif 3136 aese v1.16b, v19.16b 3137 aesmc v1.16b, v1.16b //AES block 1 - round 1 3138 3139 aese v3.16b, v19.16b 3140 aesmc v3.16b, v3.16b //AES block 3 - round 1 3141 ldr q12, [x3, #32] //load h1l | h1h 3142 #ifndef __AARCH64EB__ 3143 ext v12.16b, v12.16b, v12.16b, #8 3144 #endif 3145 aese v2.16b, v20.16b 3146 aesmc v2.16b, v2.16b //AES block 2 - round 2 3147 ld1 {v24.4s}, [x8], #16 //load rk6 3148 3149 aese v0.16b, v21.16b 3150 aesmc v0.16b, v0.16b //AES block 0 - round 3 3151 ld1 {v25.4s}, [x8], #16 //load rk7 3152 3153 aese v1.16b, v20.16b 3154 aesmc v1.16b, v1.16b //AES block 1 - round 2 3155 ld1 {v26.4s}, [x8], #16 //load rk8 3156 3157 aese v3.16b, v20.16b 3158 aesmc v3.16b, v3.16b //AES block 3 - round 2 3159 ld1 {v27.4s}, [x8], #16 //load rk9 3160 3161 aese v2.16b, v21.16b 3162 aesmc v2.16b, v2.16b //AES block 2 - round 3 3163 ld1 { v11.16b}, [x3] 3164 ext v11.16b, v11.16b, v11.16b, #8 3165 rev64 v11.16b, v11.16b 3166 3167 aese v1.16b, v21.16b 3168 aesmc v1.16b, v1.16b //AES block 1 - round 3 3169 add w12, w12, #1 //CTR block 3 3170 3171 aese v3.16b, v21.16b 3172 aesmc v3.16b, v3.16b //AES block 3 - round 3 3173 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 3174 3175 aese v0.16b, v22.16b 3176 aesmc v0.16b, v0.16b //AES block 0 - round 4 3177 ld1 {v28.4s}, [x8], #16 //load rk10 3178 3179 aese v1.16b, v22.16b 3180 aesmc v1.16b, v1.16b //AES block 1 - round 4 3181 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 3182 3183 aese v2.16b, v22.16b 3184 aesmc v2.16b, v2.16b //AES block 2 - 
round 4 3185 3186 aese v3.16b, v22.16b 3187 aesmc v3.16b, v3.16b //AES block 3 - round 4 3188 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 3189 3190 aese v0.16b, v23.16b 3191 aesmc v0.16b, v0.16b //AES block 0 - round 5 3192 ld1 {v29.4s}, [x8], #16 //load rk11 3193 3194 aese v1.16b, v23.16b 3195 aesmc v1.16b, v1.16b //AES block 1 - round 5 3196 3197 aese v2.16b, v23.16b 3198 aesmc v2.16b, v2.16b //AES block 2 - round 5 3199 3200 aese v3.16b, v23.16b 3201 aesmc v3.16b, v3.16b //AES block 3 - round 5 3202 3203 aese v0.16b, v24.16b 3204 aesmc v0.16b, v0.16b //AES block 0 - round 6 3205 3206 aese v2.16b, v24.16b 3207 aesmc v2.16b, v2.16b //AES block 2 - round 6 3208 3209 aese v3.16b, v24.16b 3210 aesmc v3.16b, v3.16b //AES block 3 - round 6 3211 3212 aese v0.16b, v25.16b 3213 aesmc v0.16b, v0.16b //AES block 0 - round 7 3214 3215 aese v2.16b, v25.16b 3216 aesmc v2.16b, v2.16b //AES block 2 - round 7 3217 3218 aese v3.16b, v25.16b 3219 aesmc v3.16b, v3.16b //AES block 3 - round 7 3220 3221 aese v1.16b, v24.16b 3222 aesmc v1.16b, v1.16b //AES block 1 - round 6 3223 3224 aese v2.16b, v26.16b 3225 aesmc v2.16b, v2.16b //AES block 2 - round 8 3226 3227 aese v3.16b, v26.16b 3228 aesmc v3.16b, v3.16b //AES block 3 - round 8 3229 3230 aese v1.16b, v25.16b 3231 aesmc v1.16b, v1.16b //AES block 1 - round 7 3232 3233 aese v2.16b, v27.16b 3234 aesmc v2.16b, v2.16b //AES block 2 - round 9 3235 3236 aese v3.16b, v27.16b 3237 aesmc v3.16b, v3.16b //AES block 3 - round 9 3238 3239 aese v1.16b, v26.16b 3240 aesmc v1.16b, v1.16b //AES block 1 - round 8 3241 sub x5, x5, #1 //byte_len - 1 3242 3243 aese v0.16b, v26.16b 3244 aesmc v0.16b, v0.16b //AES block 0 - round 8 3245 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3246 3247 aese v3.16b, v28.16b 3248 aesmc v3.16b, v3.16b //AES block 3 - round 10 3249 add x5, x5, x0 3250 3251 aese v1.16b, v27.16b 3252 aesmc v1.16b, v1.16b //AES block 1 - round 9 3253 cmp x0, x5 
//check if we have <= 4 blocks 3254 3255 aese v0.16b, v27.16b 3256 aesmc v0.16b, v0.16b //AES block 0 - round 9 3257 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 3258 3259 aese v3.16b, v29.16b //AES block 3 - round 11 3260 3261 aese v2.16b, v28.16b 3262 aesmc v2.16b, v2.16b //AES block 2 - round 10 3263 3264 aese v1.16b, v28.16b 3265 aesmc v1.16b, v1.16b //AES block 1 - round 10 3266 3267 aese v0.16b, v28.16b 3268 aesmc v0.16b, v0.16b //AES block 0 - round 10 3269 eor v16.16b, v16.16b, v8.16b //h2k | h1k 3270 3271 aese v2.16b, v29.16b //AES block 2 - round 11 3272 3273 aese v1.16b, v29.16b //AES block 1 - round 11 3274 eor v17.16b, v17.16b, v9.16b //h4k | h3k 3275 3276 aese v0.16b, v29.16b //AES block 0 - round 11 3277 b.ge .L192_dec_tail //handle tail 3278 3279 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 3280 3281 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 3282 3283 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 3284 rev w9, w12 //CTR block 4 3285 ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext 3286 3287 mov x19, v1.d[0] //AES block 1 - mov low 3288 3289 mov x20, v1.d[1] //AES block 1 - mov high 3290 3291 mov x6, v0.d[0] //AES block 0 - mov low 3292 orr x9, x11, x9, lsl #32 //CTR block 4 3293 add w12, w12, #1 //CTR block 4 3294 3295 mov x7, v0.d[1] //AES block 0 - mov high 3296 rev64 v4.16b, v4.16b //GHASH block 0 3297 3298 fmov d0, x10 //CTR block 4 3299 rev64 v5.16b, v5.16b //GHASH block 1 3300 cmp x0, x5 //check if we have <= 8 blocks 3301 3302 eor x19, x19, x13 //AES block 1 - round 12 low 3303 #ifdef __AARCH64EB__ 3304 rev x19, x19 3305 #endif 3306 fmov v0.d[1], x9 //CTR block 4 3307 rev w9, w12 //CTR block 5 3308 3309 orr x9, x11, x9, lsl #32 //CTR block 5 3310 fmov d1, x10 //CTR block 5 3311 eor x20, x20, x14 //AES block 1 - round 12 high 3312 #ifdef __AARCH64EB__ 3313 rev x20, x20 3314 #endif 3315 add w12, w12, #1 //CTR block 5 3316 fmov v1.d[1], x9 //CTR block 5 3317 eor x6, x6, x13 //AES block 0 - round 12 low 
3318 #ifdef __AARCH64EB__ 3319 rev x6, x6 3320 #endif 3321 rev w9, w12 //CTR block 6 3322 eor x7, x7, x14 //AES block 0 - round 12 high 3323 #ifdef __AARCH64EB__ 3324 rev x7, x7 3325 #endif 3326 stp x6, x7, [x2], #16 //AES block 0 - store result 3327 orr x9, x11, x9, lsl #32 //CTR block 6 3328 3329 stp x19, x20, [x2], #16 //AES block 1 - store result 3330 3331 add w12, w12, #1 //CTR block 6 3332 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 3333 b.ge .L192_dec_prepretail //do prepretail 3334 3335 .L192_dec_main_loop: //main loop start 3336 aese v1.16b, v18.16b 3337 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3338 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3339 3340 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3341 mov x21, v2.d[0] //AES block 4k+2 - mov low 3342 3343 mov x22, v2.d[1] //AES block 4k+2 - mov high 3344 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3345 rev64 v7.16b, v7.16b //GHASH block 4k+3 3346 3347 aese v1.16b, v19.16b 3348 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3349 fmov d2, x10 //CTR block 4k+6 3350 3351 aese v0.16b, v18.16b 3352 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3353 eor v4.16b, v4.16b, v11.16b //PRE 1 3354 3355 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3356 fmov v2.d[1], x9 //CTR block 4k+6 3357 3358 aese v1.16b, v20.16b 3359 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3360 mov x24, v3.d[1] //AES block 4k+3 - mov high 3361 3362 aese v0.16b, v19.16b 3363 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3364 mov x23, v3.d[0] //AES block 4k+3 - mov low 3365 3366 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3367 fmov d3, x10 //CTR block 4k+7 3368 mov d8, v4.d[1] //GHASH block 4k - mid 3369 3370 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3371 mov d10, v17.d[1] //GHASH block 4k - mid 3372 rev w9, w12 //CTR block 4k+7 3373 3374 aese v2.16b, v18.16b 3375 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3376 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3377 3378 fmov v3.d[1], x9 
//CTR block 4k+7 3379 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3380 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3381 3382 aese v1.16b, v21.16b 3383 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3384 3385 aese v0.16b, v20.16b 3386 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3387 eor x22, x22, x14 //AES block 4k+2 - round 12 high 3388 #ifdef __AARCH64EB__ 3389 rev x22, x22 3390 #endif 3391 aese v2.16b, v19.16b 3392 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3393 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3394 3395 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3396 3397 aese v3.16b, v18.16b 3398 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3399 rev64 v6.16b, v6.16b //GHASH block 4k+2 3400 3401 aese v2.16b, v20.16b 3402 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3403 3404 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3405 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3406 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3407 #ifdef __AARCH64EB__ 3408 rev x21, x21 3409 #endif 3410 aese v1.16b, v22.16b 3411 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3412 3413 aese v0.16b, v21.16b 3414 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3415 3416 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3417 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3418 3419 aese v3.16b, v19.16b 3420 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3421 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3422 3423 aese v0.16b, v22.16b 3424 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3425 3426 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3427 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3428 3429 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3430 3431 aese v0.16b, v23.16b 3432 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3433 3434 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3435 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3436 3437 aese v1.16b, v23.16b 3438 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 
3439 3440 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3441 3442 aese v3.16b, v20.16b 3443 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3444 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3445 3446 aese v1.16b, v24.16b 3447 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3448 3449 aese v0.16b, v24.16b 3450 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3451 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3452 3453 aese v3.16b, v21.16b 3454 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3455 3456 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3457 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3458 3459 aese v0.16b, v25.16b 3460 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3461 3462 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3463 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3464 3465 aese v1.16b, v25.16b 3466 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3467 3468 aese v0.16b, v26.16b 3469 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3470 movi v8.8b, #0xc2 3471 3472 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3473 3474 aese v1.16b, v26.16b 3475 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3476 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3477 3478 aese v2.16b, v21.16b 3479 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3480 3481 aese v0.16b, v27.16b 3482 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 3483 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3484 3485 aese v3.16b, v22.16b 3486 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3487 3488 aese v2.16b, v22.16b 3489 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3490 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3491 3492 aese v0.16b, v28.16b 3493 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3494 3495 aese v1.16b, v27.16b 3496 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3497 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3498 3499 aese v2.16b, v23.16b 3500 aesmc v2.16b, v2.16b //AES block 4k+6 - 
round 5 3501 3502 aese v3.16b, v23.16b 3503 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3504 shl d8, d8, #56 //mod_constant 3505 3506 aese v1.16b, v28.16b 3507 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3508 3509 aese v2.16b, v24.16b 3510 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3511 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3512 3513 aese v3.16b, v24.16b 3514 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3515 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3516 3517 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3518 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 3519 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3520 #ifdef __AARCH64EB__ 3521 rev x23, x23 3522 #endif 3523 aese v2.16b, v25.16b 3524 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3525 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3526 3527 aese v0.16b, v29.16b //AES block 4k+4 - round 11 3528 add w12, w12, #1 //CTR block 4k+7 3529 3530 aese v3.16b, v25.16b 3531 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3532 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3533 3534 aese v2.16b, v26.16b 3535 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3536 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 3537 3538 aese v1.16b, v29.16b //AES block 4k+5 - round 11 3539 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 3540 rev w9, w12 //CTR block 4k+8 3541 3542 aese v3.16b, v26.16b 3543 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3544 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3545 3546 aese v2.16b, v27.16b 3547 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3548 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3549 3550 cmp x0, x5 //.LOOP CONTROL 3551 3552 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 3553 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3554 #ifdef __AARCH64EB__ 3555 rev x24, x24 3556 #endif 3557 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - 
result 3558 3559 aese v2.16b, v28.16b 3560 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3561 orr x9, x11, x9, lsl #32 //CTR block 4k+8 3562 3563 aese v3.16b, v27.16b 3564 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3565 3566 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3567 mov x19, v1.d[0] //AES block 4k+5 - mov low 3568 3569 mov x6, v0.d[0] //AES block 4k+4 - mov low 3570 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3571 rev64 v5.16b, v5.16b //GHASH block 4k+5 3572 3573 aese v2.16b, v29.16b //AES block 4k+6 - round 11 3574 mov x7, v0.d[1] //AES block 4k+4 - mov high 3575 3576 aese v3.16b, v28.16b 3577 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3578 mov x20, v1.d[1] //AES block 4k+5 - mov high 3579 3580 fmov d0, x10 //CTR block 4k+8 3581 add w12, w12, #1 //CTR block 4k+8 3582 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3583 3584 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 3585 fmov v0.d[1], x9 //CTR block 4k+8 3586 rev w9, w12 //CTR block 4k+9 3587 3588 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3589 #ifdef __AARCH64EB__ 3590 rev x6, x6 3591 #endif 3592 orr x9, x11, x9, lsl #32 //CTR block 4k+9 3593 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3594 3595 fmov d1, x10 //CTR block 4k+9 3596 add w12, w12, #1 //CTR block 4k+9 3597 eor x19, x19, x13 //AES block 4k+5 - round 12 low 3598 #ifdef __AARCH64EB__ 3599 rev x19, x19 3600 #endif 3601 fmov v1.d[1], x9 //CTR block 4k+9 3602 rev w9, w12 //CTR block 4k+10 3603 eor x20, x20, x14 //AES block 4k+5 - round 12 high 3604 #ifdef __AARCH64EB__ 3605 rev x20, x20 3606 #endif 3607 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3608 #ifdef __AARCH64EB__ 3609 rev x7, x7 3610 #endif 3611 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 3612 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3613 3614 add w12, w12, #1 //CTR block 4k+10 3615 rev64 v4.16b, v4.16b //GHASH block 4k+4 3616 orr x9, x11, x9, lsl #32 //CTR block 4k+10 3617 3618 
aese v3.16b, v29.16b //AES block 4k+7 - round 11 3619 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 3620 b.lt .L192_dec_main_loop 3621 3622 .L192_dec_prepretail: //PREPRETAIL 3623 mov x22, v2.d[1] //AES block 4k+2 - mov high 3624 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3625 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3626 3627 aese v1.16b, v18.16b 3628 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3629 mov x21, v2.d[0] //AES block 4k+2 - mov low 3630 3631 aese v0.16b, v18.16b 3632 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3633 mov d10, v17.d[1] //GHASH block 4k - mid 3634 3635 eor v4.16b, v4.16b, v11.16b //PRE 1 3636 fmov d2, x10 //CTR block 4k+6 3637 3638 aese v1.16b, v19.16b 3639 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3640 mov x23, v3.d[0] //AES block 4k+3 - mov low 3641 3642 aese v0.16b, v19.16b 3643 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3644 mov x24, v3.d[1] //AES block 4k+3 - mov high 3645 3646 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3647 mov d8, v4.d[1] //GHASH block 4k - mid 3648 fmov d3, x10 //CTR block 4k+7 3649 3650 aese v1.16b, v20.16b 3651 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3652 rev64 v6.16b, v6.16b //GHASH block 4k+2 3653 3654 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3655 fmov v2.d[1], x9 //CTR block 4k+6 3656 rev w9, w12 //CTR block 4k+7 3657 3658 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3659 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3660 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3661 3662 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3663 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3664 #ifdef __AARCH64EB__ 3665 rev x24, x24 3666 #endif 3667 fmov v3.d[1], x9 //CTR block 4k+7 3668 3669 aese v0.16b, v20.16b 3670 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3671 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3672 #ifdef __AARCH64EB__ 3673 rev x21, x21 3674 #endif 3675 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3676 eor x22, x22, x14 
//AES block 4k+2 - round 12 high 3677 #ifdef __AARCH64EB__ 3678 rev x22, x22 3679 #endif 3680 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3681 3682 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3683 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3684 #ifdef __AARCH64EB__ 3685 rev x23, x23 3686 #endif 3687 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3688 3689 rev64 v7.16b, v7.16b //GHASH block 4k+3 3690 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3691 3692 aese v3.16b, v18.16b 3693 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3694 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3695 3696 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3697 add w12, w12, #1 //CTR block 4k+7 3698 3699 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3700 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3701 3702 aese v2.16b, v18.16b 3703 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3704 3705 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3706 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3707 3708 aese v3.16b, v19.16b 3709 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3710 3711 aese v2.16b, v19.16b 3712 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3713 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3714 3715 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3716 3717 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3718 3719 aese v2.16b, v20.16b 3720 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3721 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3722 3723 aese v3.16b, v20.16b 3724 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3725 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3726 3727 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3728 3729 aese v0.16b, v21.16b 3730 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3731 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3732 3733 aese v1.16b, v21.16b 3734 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3735 3736 pmull2 v31.1q, v31.2d, v16.2d 
//GHASH block 4k+2 - mid 3737 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3738 3739 aese v0.16b, v22.16b 3740 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3741 3742 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3743 movi v8.8b, #0xc2 3744 3745 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3746 3747 aese v2.16b, v21.16b 3748 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3749 3750 shl d8, d8, #56 //mod_constant 3751 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3752 3753 aese v0.16b, v23.16b 3754 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3755 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3756 3757 aese v2.16b, v22.16b 3758 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3759 3760 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3761 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3762 3763 aese v0.16b, v24.16b 3764 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3765 3766 aese v3.16b, v21.16b 3767 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3768 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3769 3770 aese v2.16b, v23.16b 3771 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3772 3773 aese v0.16b, v25.16b 3774 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3775 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3776 3777 aese v3.16b, v22.16b 3778 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3779 3780 aese v2.16b, v24.16b 3781 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3782 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3783 3784 aese v0.16b, v26.16b 3785 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3786 3787 aese v3.16b, v23.16b 3788 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3789 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3790 3791 aese v1.16b, v22.16b 3792 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3793 3794 aese v2.16b, v25.16b 3795 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3796 3797 aese v0.16b, v27.16b 3798 aesmc 
v0.16b, v0.16b //AES block 4k+4 - round 9 3799 3800 aese v1.16b, v23.16b 3801 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3802 3803 aese v3.16b, v24.16b 3804 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3805 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3806 3807 aese v0.16b, v28.16b 3808 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3809 3810 aese v1.16b, v24.16b 3811 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3812 3813 aese v3.16b, v25.16b 3814 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3815 3816 aese v2.16b, v26.16b 3817 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3818 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3819 3820 aese v1.16b, v25.16b 3821 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3822 3823 aese v3.16b, v26.16b 3824 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3825 3826 aese v2.16b, v27.16b 3827 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3828 3829 aese v1.16b, v26.16b 3830 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3831 3832 aese v3.16b, v27.16b 3833 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3834 3835 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3836 3837 aese v1.16b, v27.16b 3838 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3839 3840 aese v2.16b, v28.16b 3841 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3842 3843 aese v3.16b, v28.16b 3844 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3845 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3846 3847 aese v1.16b, v28.16b 3848 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3849 3850 aese v0.16b, v29.16b 3851 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3852 3853 aese v2.16b, v29.16b 3854 3855 aese v1.16b, v29.16b 3856 3857 aese v3.16b, v29.16b 3858 3859 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3860 .L192_dec_tail: //TAIL 3861 3862 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 3863 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3864 3865 
eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 3866 3867 mov x7, v0.d[1] //AES block 4k+4 - mov high 3868 3869 mov x6, v0.d[0] //AES block 4k+4 - mov low 3870 3871 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 3872 3873 cmp x5, #48 3874 3875 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3876 #ifdef __AARCH64EB__ 3877 rev x7, x7 3878 #endif 3879 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3880 #ifdef __AARCH64EB__ 3881 rev x6, x6 3882 #endif 3883 b.gt .L192_dec_blocks_more_than_3 3884 3885 movi v11.8b, #0 3886 movi v9.8b, #0 3887 3888 mov v3.16b, v2.16b 3889 mov v2.16b, v1.16b 3890 sub w12, w12, #1 3891 3892 movi v10.8b, #0 3893 cmp x5, #32 3894 b.gt .L192_dec_blocks_more_than_2 3895 3896 mov v3.16b, v1.16b 3897 cmp x5, #16 3898 sub w12, w12, #1 3899 3900 b.gt .L192_dec_blocks_more_than_1 3901 3902 sub w12, w12, #1 3903 b .L192_dec_blocks_less_than_1 3904 .L192_dec_blocks_more_than_3: //blocks left > 3 3905 rev64 v4.16b, v5.16b //GHASH final-3 block 3906 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 3907 3908 stp x6, x7, [x2], #16 //AES final-3 block - store result 3909 3910 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3911 3912 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 3913 3914 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 3915 mov x6, v0.d[0] //AES final-2 block - mov low 3916 mov d22, v4.d[1] //GHASH final-3 block - mid 3917 3918 mov x7, v0.d[1] //AES final-2 block - mov high 3919 3920 mov d10, v17.d[1] //GHASH final-3 block - mid 3921 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 3922 3923 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 3924 3925 eor x6, x6, x13 //AES final-2 block - round 12 low 3926 #ifdef __AARCH64EB__ 3927 rev x6, x6 3928 #endif 3929 movi v8.8b, #0 //suppress further partial tag feed in 3930 3931 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 3932 eor x7, x7, x14 //AES final-2 block - round 12 high 3933 #ifdef __AARCH64EB__ 3934 rev x7, x7 
3935 #endif 3936 .L192_dec_blocks_more_than_2: //blocks left > 2 3937 3938 rev64 v4.16b, v5.16b //GHASH final-2 block 3939 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 3940 3941 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3942 3943 movi v8.8b, #0 //suppress further partial tag feed in 3944 3945 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 3946 3947 mov d22, v4.d[1] //GHASH final-2 block - mid 3948 3949 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 3950 3951 stp x6, x7, [x2], #16 //AES final-2 block - store result 3952 3953 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 3954 mov x7, v0.d[1] //AES final-1 block - mov high 3955 3956 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 3957 mov x6, v0.d[0] //AES final-1 block - mov low 3958 3959 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 3960 3961 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 3962 3963 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 3964 eor x7, x7, x14 //AES final-1 block - round 12 high 3965 #ifdef __AARCH64EB__ 3966 rev x7, x7 3967 #endif 3968 eor x6, x6, x13 //AES final-1 block - round 12 low 3969 #ifdef __AARCH64EB__ 3970 rev x6, x6 3971 #endif 3972 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 3973 .L192_dec_blocks_more_than_1: //blocks left > 1 3974 3975 rev64 v4.16b, v5.16b //GHASH final-1 block 3976 3977 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3978 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 3979 3980 mov d22, v4.d[1] //GHASH final-1 block - mid 3981 3982 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 3983 3984 eor v0.16b, v5.16b, v3.16b //AES final block - result 3985 stp x6, x7, [x2], #16 //AES final-1 block - store result 3986 3987 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 3988 3989 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 3990 3991 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 3992 mov x7, v0.d[1] //AES final block 
- mov high 3993 3994 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 3995 mov x6, v0.d[0] //AES final block - mov low 3996 3997 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 3998 3999 movi v8.8b, #0 //suppress further partial tag feed in 4000 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 4001 eor x7, x7, x14 //AES final block - round 12 high 4002 #ifdef __AARCH64EB__ 4003 rev x7, x7 4004 #endif 4005 eor x6, x6, x13 //AES final block - round 12 low 4006 #ifdef __AARCH64EB__ 4007 rev x6, x6 4008 #endif 4009 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 4010 .L192_dec_blocks_less_than_1: //blocks left <= 1 4011 4012 mvn x13, xzr //rk12_l = 0xffffffffffffffff 4013 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 4014 and x1, x1, #127 //bit_length %= 128 4015 4016 sub x1, x1, #128 //bit_length -= 128 4017 4018 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 4019 4020 and x1, x1, #127 //bit_length %= 128 4021 mvn x14, xzr //rk12_h = 0xffffffffffffffff 4022 4023 lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 4024 cmp x1, #64 4025 4026 csel x9, x13, x14, lt 4027 csel x10, x14, xzr, lt 4028 4029 fmov d0, x9 //ctr0b is mask for last block 4030 and x6, x6, x9 4031 bic x4, x4, x9 //mask out low existing bytes 4032 4033 orr x6, x6, x4 4034 mov v0.d[1], x10 4035 #ifndef __AARCH64EB__ 4036 rev w9, w12 4037 #else 4038 mov w9, w12 4039 #endif 4040 4041 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 4042 str w9, [x16, #12] //store the updated counter 4043 4044 rev64 v4.16b, v5.16b //GHASH final block 4045 4046 eor v4.16b, v4.16b, v8.16b //feed in partial tag 4047 bic x5, x5, x10 //mask out high existing bytes 4048 4049 and x7, x7, x10 4050 4051 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 4052 mov d8, v4.d[1] //GHASH final block - mid 4053 4054 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 4055 4056 eor v8.8b, v8.8b, v4.8b //GHASH final block - 
mid 4057 4058 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 4059 4060 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 4061 4062 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 4063 4064 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 4065 movi v8.8b, #0xc2 4066 4067 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 4068 4069 shl d8, d8, #56 //mod_constant 4070 4071 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 4072 4073 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 4074 orr x7, x7, x5 4075 stp x6, x7, [x2] 4076 4077 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 4078 4079 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 4080 4081 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 4082 4083 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 4084 4085 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 4086 4087 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 4088 4089 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 4090 ext v11.16b, v11.16b, v11.16b, #8 4091 rev64 v11.16b, v11.16b 4092 mov x0, x15 4093 st1 { v11.16b }, [x3] 4094 4095 ldp x21, x22, [sp, #16] 4096 ldp x23, x24, [sp, #32] 4097 ldp d8, d9, [sp, #48] 4098 ldp d10, d11, [sp, #64] 4099 ldp d12, d13, [sp, #80] 4100 ldp d14, d15, [sp, #96] 4101 ldp x19, x20, [sp], #112 4102 ret 4103 4104 .L192_dec_ret: 4105 mov w0, #0x0 4106 ret 4107 .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 4108 .globl aes_gcm_enc_256_kernel 4109 .type aes_gcm_enc_256_kernel,%function 4110 .align 4 4111 aes_gcm_enc_256_kernel: 4112 AARCH64_VALID_CALL_TARGET 4113 cbz x1, .L256_enc_ret 4114 stp x19, x20, [sp, #-112]! 
4115 mov x16, x4 4116 mov x8, x5 4117 stp x21, x22, [sp, #16] 4118 stp x23, x24, [sp, #32] 4119 stp d8, d9, [sp, #48] 4120 stp d10, d11, [sp, #64] 4121 stp d12, d13, [sp, #80] 4122 stp d14, d15, [sp, #96] 4123 4124 add x4, x0, x1, lsr #3 //end_input_ptr 4125 lsr x5, x1, #3 //byte_len 4126 mov x15, x5 4127 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 4128 #ifdef __AARCH64EB__ 4129 rev x10, x10 4130 rev x11, x11 4131 #endif 4132 ldp x13, x14, [x8, #224] //load rk14 4133 #ifdef __AARCH64EB__ 4134 ror x13, x13, #32 4135 ror x14, x14, #32 4136 #endif 4137 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 4138 sub x5, x5, #1 //byte_len - 1 4139 4140 ld1 {v18.4s}, [x8], #16 //load rk0 4141 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4142 4143 ld1 {v19.4s}, [x8], #16 //load rk1 4144 add x5, x5, x0 4145 4146 lsr x12, x11, #32 4147 fmov d2, x10 //CTR block 2 4148 orr w11, w11, w11 4149 4150 rev w12, w12 //rev_ctr32 4151 cmp x0, x5 //check if we have <= 4 blocks 4152 fmov d1, x10 //CTR block 1 4153 4154 aese v0.16b, v18.16b 4155 aesmc v0.16b, v0.16b //AES block 0 - round 0 4156 add w12, w12, #1 //increment rev_ctr32 4157 4158 rev w9, w12 //CTR block 1 4159 fmov d3, x10 //CTR block 3 4160 4161 orr x9, x11, x9, lsl #32 //CTR block 1 4162 add w12, w12, #1 //CTR block 1 4163 ld1 {v20.4s}, [x8], #16 //load rk2 4164 4165 fmov v1.d[1], x9 //CTR block 1 4166 rev w9, w12 //CTR block 2 4167 add w12, w12, #1 //CTR block 2 4168 4169 orr x9, x11, x9, lsl #32 //CTR block 2 4170 ld1 {v21.4s}, [x8], #16 //load rk3 4171 4172 fmov v2.d[1], x9 //CTR block 2 4173 rev w9, w12 //CTR block 3 4174 4175 aese v0.16b, v19.16b 4176 aesmc v0.16b, v0.16b //AES block 0 - round 1 4177 orr x9, x11, x9, lsl #32 //CTR block 3 4178 4179 fmov v3.d[1], x9 //CTR block 3 4180 4181 aese v1.16b, v18.16b 4182 aesmc v1.16b, v1.16b //AES block 1 - round 0 4183 ld1 {v22.4s}, 
[x8], #16 //load rk4 4184 4185 aese v0.16b, v20.16b 4186 aesmc v0.16b, v0.16b //AES block 0 - round 2 4187 ld1 {v23.4s}, [x8], #16 //load rk5 4188 4189 aese v2.16b, v18.16b 4190 aesmc v2.16b, v2.16b //AES block 2 - round 0 4191 ld1 {v24.4s}, [x8], #16 //load rk6 4192 4193 aese v1.16b, v19.16b 4194 aesmc v1.16b, v1.16b //AES block 1 - round 1 4195 ldr q14, [x3, #80] //load h3l | h3h 4196 #ifndef __AARCH64EB__ 4197 ext v14.16b, v14.16b, v14.16b, #8 4198 #endif 4199 aese v3.16b, v18.16b 4200 aesmc v3.16b, v3.16b //AES block 3 - round 0 4201 ld1 {v25.4s}, [x8], #16 //load rk7 4202 4203 aese v2.16b, v19.16b 4204 aesmc v2.16b, v2.16b //AES block 2 - round 1 4205 ld1 {v26.4s}, [x8], #16 //load rk8 4206 4207 aese v1.16b, v20.16b 4208 aesmc v1.16b, v1.16b //AES block 1 - round 2 4209 ldr q13, [x3, #64] //load h2l | h2h 4210 #ifndef __AARCH64EB__ 4211 ext v13.16b, v13.16b, v13.16b, #8 4212 #endif 4213 aese v3.16b, v19.16b 4214 aesmc v3.16b, v3.16b //AES block 3 - round 1 4215 ld1 {v27.4s}, [x8], #16 //load rk9 4216 4217 aese v2.16b, v20.16b 4218 aesmc v2.16b, v2.16b //AES block 2 - round 2 4219 ldr q15, [x3, #112] //load h4l | h4h 4220 #ifndef __AARCH64EB__ 4221 ext v15.16b, v15.16b, v15.16b, #8 4222 #endif 4223 aese v1.16b, v21.16b 4224 aesmc v1.16b, v1.16b //AES block 1 - round 3 4225 ld1 {v28.4s}, [x8], #16 //load rk10 4226 4227 aese v3.16b, v20.16b 4228 aesmc v3.16b, v3.16b //AES block 3 - round 2 4229 ld1 {v29.4s}, [x8], #16 //load rk11 4230 4231 aese v2.16b, v21.16b 4232 aesmc v2.16b, v2.16b //AES block 2 - round 3 4233 add w12, w12, #1 //CTR block 3 4234 4235 aese v0.16b, v21.16b 4236 aesmc v0.16b, v0.16b //AES block 0 - round 3 4237 4238 aese v3.16b, v21.16b 4239 aesmc v3.16b, v3.16b //AES block 3 - round 3 4240 ld1 { v11.16b}, [x3] 4241 ext v11.16b, v11.16b, v11.16b, #8 4242 rev64 v11.16b, v11.16b 4243 4244 aese v2.16b, v22.16b 4245 aesmc v2.16b, v2.16b //AES block 2 - round 4 4246 4247 aese v0.16b, v22.16b 4248 aesmc v0.16b, v0.16b //AES block 0 - round 4 4249 4250 
aese v1.16b, v22.16b 4251 aesmc v1.16b, v1.16b //AES block 1 - round 4 4252 4253 aese v3.16b, v22.16b 4254 aesmc v3.16b, v3.16b //AES block 3 - round 4 4255 4256 aese v0.16b, v23.16b 4257 aesmc v0.16b, v0.16b //AES block 0 - round 5 4258 4259 aese v1.16b, v23.16b 4260 aesmc v1.16b, v1.16b //AES block 1 - round 5 4261 4262 aese v3.16b, v23.16b 4263 aesmc v3.16b, v3.16b //AES block 3 - round 5 4264 4265 aese v2.16b, v23.16b 4266 aesmc v2.16b, v2.16b //AES block 2 - round 5 4267 4268 aese v1.16b, v24.16b 4269 aesmc v1.16b, v1.16b //AES block 1 - round 6 4270 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 4271 4272 aese v3.16b, v24.16b 4273 aesmc v3.16b, v3.16b //AES block 3 - round 6 4274 ld1 {v30.4s}, [x8], #16 //load rk12 4275 4276 aese v0.16b, v24.16b 4277 aesmc v0.16b, v0.16b //AES block 0 - round 6 4278 ldr q12, [x3, #32] //load h1l | h1h 4279 #ifndef __AARCH64EB__ 4280 ext v12.16b, v12.16b, v12.16b, #8 4281 #endif 4282 aese v2.16b, v24.16b 4283 aesmc v2.16b, v2.16b //AES block 2 - round 6 4284 ld1 {v31.4s}, [x8], #16 //load rk13 4285 4286 aese v1.16b, v25.16b 4287 aesmc v1.16b, v1.16b //AES block 1 - round 7 4288 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 4289 4290 aese v0.16b, v25.16b 4291 aesmc v0.16b, v0.16b //AES block 0 - round 7 4292 4293 aese v2.16b, v25.16b 4294 aesmc v2.16b, v2.16b //AES block 2 - round 7 4295 4296 aese v3.16b, v25.16b 4297 aesmc v3.16b, v3.16b //AES block 3 - round 7 4298 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 4299 4300 aese v1.16b, v26.16b 4301 aesmc v1.16b, v1.16b //AES block 1 - round 8 4302 4303 aese v2.16b, v26.16b 4304 aesmc v2.16b, v2.16b //AES block 2 - round 8 4305 4306 aese v3.16b, v26.16b 4307 aesmc v3.16b, v3.16b //AES block 3 - round 8 4308 4309 aese v1.16b, v27.16b 4310 aesmc v1.16b, v1.16b //AES block 1 - round 9 4311 4312 aese v2.16b, v27.16b 4313 aesmc v2.16b, v2.16b //AES block 2 - round 9 4314 4315 aese v0.16b, v26.16b 4316 aesmc v0.16b, v0.16b //AES block 0 - round 8 4317 4318 aese v1.16b, v28.16b 4319 aesmc v1.16b, v1.16b 
//AES block 1 - round 10 4320 4321 aese v3.16b, v27.16b 4322 aesmc v3.16b, v3.16b //AES block 3 - round 9 4323 4324 aese v0.16b, v27.16b 4325 aesmc v0.16b, v0.16b //AES block 0 - round 9 4326 4327 aese v2.16b, v28.16b 4328 aesmc v2.16b, v2.16b //AES block 2 - round 10 4329 4330 aese v3.16b, v28.16b 4331 aesmc v3.16b, v3.16b //AES block 3 - round 10 4332 4333 aese v1.16b, v29.16b 4334 aesmc v1.16b, v1.16b //AES block 1 - round 11 4335 4336 aese v2.16b, v29.16b 4337 aesmc v2.16b, v2.16b //AES block 2 - round 11 4338 4339 aese v0.16b, v28.16b 4340 aesmc v0.16b, v0.16b //AES block 0 - round 10 4341 4342 aese v1.16b, v30.16b 4343 aesmc v1.16b, v1.16b //AES block 1 - round 12 4344 4345 aese v2.16b, v30.16b 4346 aesmc v2.16b, v2.16b //AES block 2 - round 12 4347 4348 aese v0.16b, v29.16b 4349 aesmc v0.16b, v0.16b //AES block 0 - round 11 4350 eor v17.16b, v17.16b, v9.16b //h4k | h3k 4351 4352 aese v3.16b, v29.16b 4353 aesmc v3.16b, v3.16b //AES block 3 - round 11 4354 4355 aese v2.16b, v31.16b //AES block 2 - round 13 4356 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 4357 4358 aese v0.16b, v30.16b 4359 aesmc v0.16b, v0.16b //AES block 0 - round 12 4360 4361 aese v3.16b, v30.16b 4362 aesmc v3.16b, v3.16b //AES block 3 - round 12 4363 4364 aese v1.16b, v31.16b //AES block 1 - round 13 4365 4366 aese v0.16b, v31.16b //AES block 0 - round 13 4367 4368 aese v3.16b, v31.16b //AES block 3 - round 13 4369 eor v16.16b, v16.16b, v8.16b //h2k | h1k 4370 b.ge .L256_enc_tail //handle tail 4371 4372 ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 4373 #ifdef __AARCH64EB__ 4374 rev x19, x19 4375 rev x20, x20 4376 #endif 4377 rev w9, w12 //CTR block 4 4378 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 4379 #ifdef __AARCH64EB__ 4380 rev x6, x6 4381 rev x7, x7 4382 #endif 4383 ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 4384 #ifdef __AARCH64EB__ 4385 rev x23, x23 4386 rev x24, x24 4387 #endif 4388 ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 4389 #ifdef 
__AARCH64EB__ 4390 rev x21, x21 4391 rev x22, x22 4392 #endif 4393 add x0, x0, #64 //AES input_ptr update 4394 4395 eor x19, x19, x13 //AES block 1 - round 14 low 4396 eor x20, x20, x14 //AES block 1 - round 14 high 4397 4398 fmov d5, x19 //AES block 1 - mov low 4399 eor x6, x6, x13 //AES block 0 - round 14 low 4400 4401 eor x7, x7, x14 //AES block 0 - round 14 high 4402 eor x24, x24, x14 //AES block 3 - round 14 high 4403 fmov d4, x6 //AES block 0 - mov low 4404 4405 cmp x0, x5 //check if we have <= 8 blocks 4406 fmov v4.d[1], x7 //AES block 0 - mov high 4407 eor x23, x23, x13 //AES block 3 - round 14 low 4408 4409 eor x21, x21, x13 //AES block 2 - round 14 low 4410 fmov v5.d[1], x20 //AES block 1 - mov high 4411 4412 fmov d6, x21 //AES block 2 - mov low 4413 add w12, w12, #1 //CTR block 4 4414 4415 orr x9, x11, x9, lsl #32 //CTR block 4 4416 fmov d7, x23 //AES block 3 - mov low 4417 eor x22, x22, x14 //AES block 2 - round 14 high 4418 4419 fmov v6.d[1], x22 //AES block 2 - mov high 4420 4421 eor v4.16b, v4.16b, v0.16b //AES block 0 - result 4422 fmov d0, x10 //CTR block 4 4423 4424 fmov v0.d[1], x9 //CTR block 4 4425 rev w9, w12 //CTR block 5 4426 add w12, w12, #1 //CTR block 5 4427 4428 eor v5.16b, v5.16b, v1.16b //AES block 1 - result 4429 fmov d1, x10 //CTR block 5 4430 orr x9, x11, x9, lsl #32 //CTR block 5 4431 4432 fmov v1.d[1], x9 //CTR block 5 4433 rev w9, w12 //CTR block 6 4434 st1 { v4.16b}, [x2], #16 //AES block 0 - store result 4435 4436 fmov v7.d[1], x24 //AES block 3 - mov high 4437 orr x9, x11, x9, lsl #32 //CTR block 6 4438 eor v6.16b, v6.16b, v2.16b //AES block 2 - result 4439 4440 st1 { v5.16b}, [x2], #16 //AES block 1 - store result 4441 4442 add w12, w12, #1 //CTR block 6 4443 fmov d2, x10 //CTR block 6 4444 4445 fmov v2.d[1], x9 //CTR block 6 4446 st1 { v6.16b}, [x2], #16 //AES block 2 - store result 4447 rev w9, w12 //CTR block 7 4448 4449 orr x9, x11, x9, lsl #32 //CTR block 7 4450 4451 eor v7.16b, v7.16b, v3.16b //AES block 3 - result 4452 
st1 { v7.16b}, [x2], #16 //AES block 3 - store result 4453 b.ge .L256_enc_prepretail //do prepretail 4454 4455 .L256_enc_main_loop: //main loop start 4456 aese v0.16b, v18.16b 4457 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 4458 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 4459 4460 aese v1.16b, v18.16b 4461 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 4462 fmov d3, x10 //CTR block 4k+3 4463 4464 aese v2.16b, v18.16b 4465 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 4466 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 4467 4468 aese v0.16b, v19.16b 4469 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 4470 fmov v3.d[1], x9 //CTR block 4k+3 4471 4472 aese v1.16b, v19.16b 4473 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 4474 ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext 4475 #ifdef __AARCH64EB__ 4476 rev x23, x23 4477 rev x24, x24 4478 #endif 4479 aese v2.16b, v19.16b 4480 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 4481 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 4482 #ifdef __AARCH64EB__ 4483 rev x21, x21 4484 rev x22, x22 4485 #endif 4486 aese v0.16b, v20.16b 4487 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 4488 eor v4.16b, v4.16b, v11.16b //PRE 1 4489 4490 aese v1.16b, v20.16b 4491 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 4492 4493 aese v3.16b, v18.16b 4494 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 4495 eor x23, x23, x13 //AES block 4k+7 - round 14 low 4496 4497 aese v0.16b, v21.16b 4498 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 4499 mov d10, v17.d[1] //GHASH block 4k - mid 4500 4501 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 4502 eor x22, x22, x14 //AES block 4k+6 - round 14 high 4503 mov d8, v4.d[1] //GHASH block 4k - mid 4504 4505 aese v3.16b, v19.16b 4506 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 4507 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 4508 4509 aese v0.16b, v22.16b 4510 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 4511 4512 pmull v11.1q, v4.1d, 
v15.1d //GHASH block 4k - low 4513 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 4514 4515 aese v2.16b, v20.16b 4516 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 4517 4518 aese v0.16b, v23.16b 4519 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 4520 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 4521 4522 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 4523 4524 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 4525 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 4526 4527 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 4528 4529 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 4530 mov d4, v5.d[1] //GHASH block 4k+1 - mid 4531 4532 aese v1.16b, v21.16b 4533 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 4534 4535 aese v3.16b, v20.16b 4536 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 4537 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 4538 4539 aese v2.16b, v21.16b 4540 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 4541 4542 aese v1.16b, v22.16b 4543 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 4544 mov d8, v6.d[1] //GHASH block 4k+2 - mid 4545 4546 aese v3.16b, v21.16b 4547 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 4548 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 4549 4550 aese v2.16b, v22.16b 4551 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 4552 4553 aese v0.16b, v24.16b 4554 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 4555 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 4556 4557 aese v3.16b, v22.16b 4558 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 4559 4560 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 4561 4562 aese v0.16b, v25.16b 4563 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 4564 4565 aese v3.16b, v23.16b 4566 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 4567 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 4568 4569 aese v1.16b, v23.16b 4570 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 4571 4572 aese v0.16b, v26.16b 4573 aesmc v0.16b, v0.16b //AES 
block 4k+4 - round 8 4574 4575 aese v2.16b, v23.16b 4576 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 4577 4578 aese v1.16b, v24.16b 4579 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 4580 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 4581 4582 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 4583 4584 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 4585 4586 aese v1.16b, v25.16b 4587 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 4588 4589 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 4590 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 4591 4592 aese v3.16b, v24.16b 4593 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 4594 ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext 4595 #ifdef __AARCH64EB__ 4596 rev x19, x19 4597 rev x20, x20 4598 #endif 4599 aese v1.16b, v26.16b 4600 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 4601 mov d4, v7.d[1] //GHASH block 4k+3 - mid 4602 4603 aese v2.16b, v24.16b 4604 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 4605 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 4606 4607 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 4608 4609 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 4610 eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid 4611 4612 aese v2.16b, v25.16b 4613 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 4614 eor x19, x19, x13 //AES block 4k+5 - round 14 low 4615 4616 aese v1.16b, v27.16b 4617 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 4618 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 4619 4620 aese v3.16b, v25.16b 4621 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 4622 eor x21, x21, x13 //AES block 4k+6 - round 14 low 4623 4624 aese v0.16b, v27.16b 4625 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 4626 movi v8.8b, #0xc2 4627 4628 pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid 4629 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 4630 fmov d5, x19 //AES block 4k+5 - mov low 4631 4632 aese v2.16b, v26.16b 4633 aesmc v2.16b, v2.16b 
//AES block 4k+6 - round 8 4634 ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 4635 #ifdef __AARCH64EB__ 4636 rev x6, x6 4637 rev x7, x7 4638 #endif 4639 aese v0.16b, v28.16b 4640 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 4641 shl d8, d8, #56 //mod_constant 4642 4643 aese v3.16b, v26.16b 4644 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 4645 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 4646 4647 aese v2.16b, v27.16b 4648 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 4649 4650 aese v1.16b, v28.16b 4651 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 4652 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid 4653 4654 aese v3.16b, v27.16b 4655 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 4656 add w12, w12, #1 //CTR block 4k+3 4657 4658 aese v0.16b, v29.16b 4659 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 4660 eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 4661 4662 aese v1.16b, v29.16b 4663 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 4664 add x0, x0, #64 //AES input_ptr update 4665 4666 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 4667 rev w9, w12 //CTR block 4k+8 4668 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 4669 4670 aese v2.16b, v28.16b 4671 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 4672 eor x6, x6, x13 //AES block 4k+4 - round 14 low 4673 4674 aese v1.16b, v30.16b 4675 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 4676 eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up 4677 4678 aese v3.16b, v28.16b 4679 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 4680 eor x7, x7, x14 //AES block 4k+4 - round 14 high 4681 4682 fmov d4, x6 //AES block 4k+4 - mov low 4683 orr x9, x11, x9, lsl #32 //CTR block 4k+8 4684 eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid 4685 4686 aese v0.16b, v30.16b 4687 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 4688 eor x20, x20, x14 //AES block 4k+5 - round 14 high 4689 4690 aese v2.16b, v29.16b 4691 aesmc v2.16b, v2.16b //AES 
block 4k+6 - round 11 4692 eor x24, x24, x14 //AES block 4k+7 - round 14 high 4693 4694 aese v3.16b, v29.16b 4695 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 4696 add w12, w12, #1 //CTR block 4k+8 4697 4698 aese v0.16b, v31.16b //AES block 4k+4 - round 13 4699 fmov v4.d[1], x7 //AES block 4k+4 - mov high 4700 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 4701 4702 aese v2.16b, v30.16b 4703 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 4704 fmov d7, x23 //AES block 4k+7 - mov low 4705 4706 aese v1.16b, v31.16b //AES block 4k+5 - round 13 4707 fmov v5.d[1], x20 //AES block 4k+5 - mov high 4708 4709 fmov d6, x21 //AES block 4k+6 - mov low 4710 cmp x0, x5 //.LOOP CONTROL 4711 4712 fmov v6.d[1], x22 //AES block 4k+6 - mov high 4713 4714 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 4715 eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 4716 fmov d0, x10 //CTR block 4k+8 4717 4718 fmov v0.d[1], x9 //CTR block 4k+8 4719 rev w9, w12 //CTR block 4k+9 4720 add w12, w12, #1 //CTR block 4k+9 4721 4722 eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 4723 fmov d1, x10 //CTR block 4k+9 4724 orr x9, x11, x9, lsl #32 //CTR block 4k+9 4725 4726 aese v3.16b, v30.16b 4727 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 4728 fmov v1.d[1], x9 //CTR block 4k+9 4729 4730 aese v2.16b, v31.16b //AES block 4k+6 - round 13 4731 rev w9, w12 //CTR block 4k+10 4732 st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result 4733 4734 orr x9, x11, x9, lsl #32 //CTR block 4k+10 4735 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 4736 fmov v7.d[1], x24 //AES block 4k+7 - mov high 4737 4738 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 4739 st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 4740 add w12, w12, #1 //CTR block 4k+10 4741 4742 aese v3.16b, v31.16b //AES block 4k+7 - round 13 4743 eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 4744 fmov d2, x10 //CTR block 4k+10 4745 4746 st1 { v6.16b}, [x2], #16 //AES block 4k+6 
- store result 4747 fmov v2.d[1], x9 //CTR block 4k+10 4748 rev w9, w12 //CTR block 4k+11 4749 4750 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 4751 orr x9, x11, x9, lsl #32 //CTR block 4k+11 4752 4753 eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result 4754 st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result 4755 b.lt .L256_enc_main_loop 4756 4757 .L256_enc_prepretail: //PREPRETAIL 4758 aese v1.16b, v18.16b 4759 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 4760 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 4761 4762 aese v2.16b, v18.16b 4763 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 4764 fmov d3, x10 //CTR block 4k+3 4765 4766 aese v0.16b, v18.16b 4767 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 4768 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 4769 4770 fmov v3.d[1], x9 //CTR block 4k+3 4771 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 4772 4773 aese v2.16b, v19.16b 4774 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 4775 4776 aese v0.16b, v19.16b 4777 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 4778 4779 eor v4.16b, v4.16b, v11.16b //PRE 1 4780 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 4781 4782 aese v2.16b, v20.16b 4783 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 4784 4785 aese v3.16b, v18.16b 4786 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 4787 mov d10, v17.d[1] //GHASH block 4k - mid 4788 4789 aese v1.16b, v19.16b 4790 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 4791 4792 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 4793 mov d8, v4.d[1] //GHASH block 4k - mid 4794 4795 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 4796 4797 aese v2.16b, v21.16b 4798 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 4799 4800 aese v1.16b, v20.16b 4801 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 4802 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 4803 4804 aese v0.16b, v20.16b 4805 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 4806 4807 aese v3.16b, v19.16b 4808 aesmc v3.16b, 
v3.16b //AES block 4k+7 - round 1 4809 4810 aese v1.16b, v21.16b 4811 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 4812 4813 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 4814 4815 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 4816 4817 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 4818 4819 aese v3.16b, v20.16b 4820 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 4821 4822 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 4823 mov d4, v5.d[1] //GHASH block 4k+1 - mid 4824 4825 aese v0.16b, v21.16b 4826 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 4827 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 4828 4829 aese v3.16b, v21.16b 4830 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 4831 4832 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 4833 mov d8, v6.d[1] //GHASH block 4k+2 - mid 4834 4835 aese v0.16b, v22.16b 4836 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 4837 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 4838 4839 aese v3.16b, v22.16b 4840 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 4841 4842 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 4843 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 4844 add w12, w12, #1 //CTR block 4k+3 4845 4846 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 4847 4848 aese v3.16b, v23.16b 4849 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 4850 4851 aese v2.16b, v22.16b 4852 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 4853 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 4854 4855 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 4856 4857 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 4858 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 4859 4860 aese v2.16b, v23.16b 4861 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 4862 4863 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 4864 mov d4, v7.d[1] //GHASH block 4k+3 - mid 4865 4866 aese v1.16b, v22.16b 4867 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 4868 4869 pmull2 v8.1q, v8.2d, 
v16.2d //GHASH block 4k+2 - mid 4870 4871 eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid 4872 4873 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 4874 4875 aese v1.16b, v23.16b 4876 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 4877 4878 pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid 4879 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 4880 4881 aese v0.16b, v23.16b 4882 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 4883 4884 aese v1.16b, v24.16b 4885 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 4886 4887 aese v2.16b, v24.16b 4888 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 4889 4890 aese v0.16b, v24.16b 4891 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 4892 movi v8.8b, #0xc2 4893 4894 aese v3.16b, v24.16b 4895 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 4896 4897 aese v1.16b, v25.16b 4898 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 4899 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 4900 4901 aese v0.16b, v25.16b 4902 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 4903 4904 aese v3.16b, v25.16b 4905 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 4906 shl d8, d8, #56 //mod_constant 4907 4908 aese v1.16b, v26.16b 4909 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 4910 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid 4911 4912 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 4913 4914 aese v3.16b, v26.16b 4915 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 4916 4917 aese v1.16b, v27.16b 4918 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 4919 4920 aese v0.16b, v26.16b 4921 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 4922 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 4923 4924 aese v3.16b, v27.16b 4925 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 4926 4927 eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 4928 4929 pmull v4.1q, v9.1d, v8.1d 4930 ext v9.16b, v9.16b, v9.16b, #8 4931 4932 aese v3.16b, v28.16b 4933 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 4934 4935 aese v2.16b, v25.16b 
4936 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 4937 eor v10.16b, v10.16b, v11.16b 4938 4939 aese v1.16b, v28.16b 4940 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 4941 4942 aese v0.16b, v27.16b 4943 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 4944 4945 aese v2.16b, v26.16b 4946 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 4947 4948 aese v1.16b, v29.16b 4949 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 4950 eor v10.16b, v10.16b, v4.16b 4951 4952 aese v0.16b, v28.16b 4953 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 4954 4955 aese v2.16b, v27.16b 4956 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 4957 4958 aese v1.16b, v30.16b 4959 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 4960 4961 aese v0.16b, v29.16b 4962 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 4963 eor v10.16b, v10.16b, v9.16b 4964 4965 aese v3.16b, v29.16b 4966 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 4967 4968 aese v2.16b, v28.16b 4969 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 4970 4971 aese v0.16b, v30.16b 4972 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 4973 4974 pmull v4.1q, v10.1d, v8.1d 4975 4976 aese v2.16b, v29.16b 4977 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 4978 ext v10.16b, v10.16b, v10.16b, #8 4979 4980 aese v3.16b, v30.16b 4981 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 4982 4983 aese v1.16b, v31.16b //AES block 4k+5 - round 13 4984 eor v11.16b, v11.16b, v4.16b 4985 4986 aese v2.16b, v30.16b 4987 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 4988 4989 aese v3.16b, v31.16b //AES block 4k+7 - round 13 4990 4991 aese v0.16b, v31.16b //AES block 4k+4 - round 13 4992 4993 aese v2.16b, v31.16b //AES block 4k+6 - round 13 4994 eor v11.16b, v11.16b, v10.16b 4995 .L256_enc_tail: //TAIL 4996 4997 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 4998 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 4999 ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 5000 #ifdef __AARCH64EB__ 5001 rev x6, 
x6 5002 rev x7, x7 5003 #endif 5004 eor x6, x6, x13 //AES block 4k+4 - round 14 low 5005 eor x7, x7, x14 //AES block 4k+4 - round 14 high 5006 5007 cmp x5, #48 5008 fmov d4, x6 //AES block 4k+4 - mov low 5009 5010 fmov v4.d[1], x7 //AES block 4k+4 - mov high 5011 5012 eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 5013 b.gt .L256_enc_blocks_more_than_3 5014 5015 cmp x5, #32 5016 mov v3.16b, v2.16b 5017 movi v11.8b, #0 5018 5019 movi v9.8b, #0 5020 sub w12, w12, #1 5021 5022 mov v2.16b, v1.16b 5023 movi v10.8b, #0 5024 b.gt .L256_enc_blocks_more_than_2 5025 5026 mov v3.16b, v1.16b 5027 sub w12, w12, #1 5028 cmp x5, #16 5029 5030 b.gt .L256_enc_blocks_more_than_1 5031 5032 sub w12, w12, #1 5033 b .L256_enc_blocks_less_than_1 5034 .L256_enc_blocks_more_than_3: //blocks left > 3 5035 st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 5036 5037 ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 5038 #ifdef __AARCH64EB__ 5039 rev x6, x6 5040 rev x7, x7 5041 #endif 5042 rev64 v4.16b, v5.16b //GHASH final-3 block 5043 5044 eor x6, x6, x13 //AES final-2 block - round 14 low 5045 eor v4.16b, v4.16b, v8.16b //feed in partial tag 5046 5047 eor x7, x7, x14 //AES final-2 block - round 14 high 5048 5049 mov d22, v4.d[1] //GHASH final-3 block - mid 5050 fmov d5, x6 //AES final-2 block - mov low 5051 5052 fmov v5.d[1], x7 //AES final-2 block - mov high 5053 5054 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 5055 movi v8.8b, #0 //suppress further partial tag feed in 5056 5057 mov d10, v17.d[1] //GHASH final-3 block - mid 5058 5059 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 5060 5061 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 5062 5063 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 5064 eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 5065 .L256_enc_blocks_more_than_2: //blocks left > 2 5066 5067 st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 5068 5069 ldp x6, x7, [x0], #16 //AES final-1 
block - load input low & high 5070 #ifdef __AARCH64EB__ 5071 rev x6, x6 5072 rev x7, x7 5073 #endif 5074 rev64 v4.16b, v5.16b //GHASH final-2 block 5075 5076 eor x6, x6, x13 //AES final-1 block - round 14 low 5077 eor v4.16b, v4.16b, v8.16b //feed in partial tag 5078 5079 fmov d5, x6 //AES final-1 block - mov low 5080 eor x7, x7, x14 //AES final-1 block - round 14 high 5081 5082 fmov v5.d[1], x7 //AES final-1 block - mov high 5083 5084 movi v8.8b, #0 //suppress further partial tag feed in 5085 5086 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 5087 mov d22, v4.d[1] //GHASH final-2 block - mid 5088 5089 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 5090 5091 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 5092 5093 eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 5094 5095 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 5096 5097 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 5098 5099 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 5100 5101 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 5102 .L256_enc_blocks_more_than_1: //blocks left > 1 5103 5104 st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 5105 5106 rev64 v4.16b, v5.16b //GHASH final-1 block 5107 5108 ldp x6, x7, [x0], #16 //AES final block - load input low & high 5109 #ifdef __AARCH64EB__ 5110 rev x6, x6 5111 rev x7, x7 5112 #endif 5113 eor v4.16b, v4.16b, v8.16b //feed in partial tag 5114 5115 movi v8.8b, #0 //suppress further partial tag feed in 5116 5117 eor x6, x6, x13 //AES final block - round 14 low 5118 mov d22, v4.d[1] //GHASH final-1 block - mid 5119 5120 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 5121 eor x7, x7, x14 //AES final block - round 14 high 5122 5123 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 5124 5125 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 5126 5127 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 5128 fmov d5, x6 //AES final block - mov low 5129 5130 fmov 
v5.d[1], x7 //AES final block - mov high 5131 5132 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 5133 5134 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 5135 5136 eor v5.16b, v5.16b, v3.16b //AES final block - result 5137 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 5138 5139 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 5140 .L256_enc_blocks_less_than_1: //blocks left <= 1 5141 5142 and x1, x1, #127 //bit_length %= 128 5143 5144 mvn x13, xzr //rk14_l = 0xffffffffffffffff 5145 sub x1, x1, #128 //bit_length -= 128 5146 5147 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 5148 ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 5149 5150 mvn x14, xzr //rk14_h = 0xffffffffffffffff 5151 and x1, x1, #127 //bit_length %= 128 5152 5153 lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 5154 cmp x1, #64 5155 5156 csel x6, x13, x14, lt 5157 csel x7, x14, xzr, lt 5158 5159 fmov d0, x6 //ctr0b is mask for last block 5160 5161 fmov v0.d[1], x7 5162 5163 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 5164 5165 rev64 v4.16b, v5.16b //GHASH final block 5166 5167 eor v4.16b, v4.16b, v8.16b //feed in partial tag 5168 5169 bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 5170 5171 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 5172 mov d8, v4.d[1] //GHASH final block - mid 5173 #ifndef __AARCH64EB__ 5174 rev w9, w12 5175 #else 5176 mov w9, w12 5177 #endif 5178 5179 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 5180 5181 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 5182 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 5183 5184 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 5185 5186 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 5187 5188 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 5189 movi v8.8b, #0xc2 5190 5191 eor v4.16b, v11.16b, 
v9.16b //MODULO - karatsuba tidy up 5192 5193 shl d8, d8, #56 //mod_constant 5194 5195 eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up 5196 5197 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5198 5199 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5200 5201 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5202 5203 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5204 5205 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5206 5207 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5208 5209 str w9, [x16, #12] //store the updated counter 5210 5211 st1 { v5.16b}, [x2] //store all 16B 5212 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 5213 5214 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5215 ext v11.16b, v11.16b, v11.16b, #8 5216 rev64 v11.16b, v11.16b 5217 mov x0, x15 5218 st1 { v11.16b }, [x3] 5219 5220 ldp x21, x22, [sp, #16] 5221 ldp x23, x24, [sp, #32] 5222 ldp d8, d9, [sp, #48] 5223 ldp d10, d11, [sp, #64] 5224 ldp d12, d13, [sp, #80] 5225 ldp d14, d15, [sp, #96] 5226 ldp x19, x20, [sp], #112 5227 ret 5228 5229 .L256_enc_ret: 5230 mov w0, #0x0 5231 ret 5232 .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5233 .globl aes_gcm_dec_256_kernel 5234 .type aes_gcm_dec_256_kernel,%function 5235 .align 4 5236 aes_gcm_dec_256_kernel: 5237 AARCH64_VALID_CALL_TARGET 5238 cbz x1, .L256_dec_ret 5239 stp x19, x20, [sp, #-112]! 
5240 mov x16, x4 5241 mov x8, x5 5242 stp x21, x22, [sp, #16] 5243 stp x23, x24, [sp, #32] 5244 stp d8, d9, [sp, #48] 5245 stp d10, d11, [sp, #64] 5246 stp d12, d13, [sp, #80] 5247 stp d14, d15, [sp, #96] 5248 5249 lsr x5, x1, #3 //byte_len 5250 mov x15, x5 5251 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 5252 #ifdef __AARCH64EB__ 5253 rev x10, x10 5254 rev x11, x11 5255 #endif 5256 ldp x13, x14, [x8, #224] //load rk14 5257 #ifdef __AARCH64EB__ 5258 ror x14, x14, #32 5259 ror x13, x13, #32 5260 #endif 5261 ld1 {v18.4s}, [x8], #16 //load rk0 5262 sub x5, x5, #1 //byte_len - 1 5263 5264 ld1 {v19.4s}, [x8], #16 //load rk1 5265 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5266 5267 add x4, x0, x1, lsr #3 //end_input_ptr 5268 ld1 {v20.4s}, [x8], #16 //load rk2 5269 5270 lsr x12, x11, #32 5271 ld1 {v21.4s}, [x8], #16 //load rk3 5272 orr w11, w11, w11 5273 5274 ld1 {v22.4s}, [x8], #16 //load rk4 5275 add x5, x5, x0 5276 rev w12, w12 //rev_ctr32 5277 5278 add w12, w12, #1 //increment rev_ctr32 5279 fmov d3, x10 //CTR block 3 5280 5281 rev w9, w12 //CTR block 1 5282 add w12, w12, #1 //CTR block 1 5283 fmov d1, x10 //CTR block 1 5284 5285 orr x9, x11, x9, lsl #32 //CTR block 1 5286 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 5287 5288 fmov v1.d[1], x9 //CTR block 1 5289 rev w9, w12 //CTR block 2 5290 add w12, w12, #1 //CTR block 2 5291 5292 fmov d2, x10 //CTR block 2 5293 orr x9, x11, x9, lsl #32 //CTR block 2 5294 5295 fmov v2.d[1], x9 //CTR block 2 5296 rev w9, w12 //CTR block 3 5297 5298 orr x9, x11, x9, lsl #32 //CTR block 3 5299 ld1 {v23.4s}, [x8], #16 //load rk5 5300 5301 fmov v3.d[1], x9 //CTR block 3 5302 add w12, w12, #1 //CTR block 3 5303 5304 ld1 {v24.4s}, [x8], #16 //load rk6 5305 5306 ld1 {v25.4s}, [x8], #16 //load rk7 5307 5308 ld1 {v26.4s}, [x8], #16 //load rk8 5309 5310 aese v0.16b, v18.16b 5311 aesmc v0.16b, 
v0.16b //AES block 0 - round 0 5312 ldr q14, [x3, #80] //load h3l | h3h 5313 #ifndef __AARCH64EB__ 5314 ext v14.16b, v14.16b, v14.16b, #8 5315 #endif 5316 5317 aese v3.16b, v18.16b 5318 aesmc v3.16b, v3.16b //AES block 3 - round 0 5319 ldr q15, [x3, #112] //load h4l | h4h 5320 #ifndef __AARCH64EB__ 5321 ext v15.16b, v15.16b, v15.16b, #8 5322 #endif 5323 5324 aese v1.16b, v18.16b 5325 aesmc v1.16b, v1.16b //AES block 1 - round 0 5326 ldr q13, [x3, #64] //load h2l | h2h 5327 #ifndef __AARCH64EB__ 5328 ext v13.16b, v13.16b, v13.16b, #8 5329 #endif 5330 5331 aese v2.16b, v18.16b 5332 aesmc v2.16b, v2.16b //AES block 2 - round 0 5333 ld1 {v27.4s}, [x8], #16 //load rk9 5334 5335 aese v0.16b, v19.16b 5336 aesmc v0.16b, v0.16b //AES block 0 - round 1 5337 5338 aese v1.16b, v19.16b 5339 aesmc v1.16b, v1.16b //AES block 1 - round 1 5340 ld1 { v11.16b}, [x3] 5341 ext v11.16b, v11.16b, v11.16b, #8 5342 rev64 v11.16b, v11.16b 5343 5344 aese v2.16b, v19.16b 5345 aesmc v2.16b, v2.16b //AES block 2 - round 1 5346 ld1 {v28.4s}, [x8], #16 //load rk10 5347 5348 aese v3.16b, v19.16b 5349 aesmc v3.16b, v3.16b //AES block 3 - round 1 5350 ld1 {v29.4s}, [x8], #16 //load rk11 5351 5352 aese v0.16b, v20.16b 5353 aesmc v0.16b, v0.16b //AES block 0 - round 2 5354 ldr q12, [x3, #32] //load h1l | h1h 5355 #ifndef __AARCH64EB__ 5356 ext v12.16b, v12.16b, v12.16b, #8 5357 #endif 5358 aese v2.16b, v20.16b 5359 aesmc v2.16b, v2.16b //AES block 2 - round 2 5360 ld1 {v30.4s}, [x8], #16 //load rk12 5361 5362 aese v3.16b, v20.16b 5363 aesmc v3.16b, v3.16b //AES block 3 - round 2 5364 5365 aese v0.16b, v21.16b 5366 aesmc v0.16b, v0.16b //AES block 0 - round 3 5367 5368 aese v1.16b, v20.16b 5369 aesmc v1.16b, v1.16b //AES block 1 - round 2 5370 5371 aese v3.16b, v21.16b 5372 aesmc v3.16b, v3.16b //AES block 3 - round 3 5373 5374 aese v0.16b, v22.16b 5375 aesmc v0.16b, v0.16b //AES block 0 - round 4 5376 cmp x0, x5 //check if we have <= 4 blocks 5377 5378 aese v2.16b, v21.16b 5379 aesmc v2.16b, v2.16b 
//AES block 2 - round 3 5380 5381 aese v1.16b, v21.16b 5382 aesmc v1.16b, v1.16b //AES block 1 - round 3 5383 5384 aese v3.16b, v22.16b 5385 aesmc v3.16b, v3.16b //AES block 3 - round 4 5386 5387 aese v2.16b, v22.16b 5388 aesmc v2.16b, v2.16b //AES block 2 - round 4 5389 5390 aese v1.16b, v22.16b 5391 aesmc v1.16b, v1.16b //AES block 1 - round 4 5392 5393 aese v3.16b, v23.16b 5394 aesmc v3.16b, v3.16b //AES block 3 - round 5 5395 5396 aese v0.16b, v23.16b 5397 aesmc v0.16b, v0.16b //AES block 0 - round 5 5398 5399 aese v1.16b, v23.16b 5400 aesmc v1.16b, v1.16b //AES block 1 - round 5 5401 5402 aese v2.16b, v23.16b 5403 aesmc v2.16b, v2.16b //AES block 2 - round 5 5404 5405 aese v0.16b, v24.16b 5406 aesmc v0.16b, v0.16b //AES block 0 - round 6 5407 5408 aese v3.16b, v24.16b 5409 aesmc v3.16b, v3.16b //AES block 3 - round 6 5410 5411 aese v1.16b, v24.16b 5412 aesmc v1.16b, v1.16b //AES block 1 - round 6 5413 5414 aese v2.16b, v24.16b 5415 aesmc v2.16b, v2.16b //AES block 2 - round 6 5416 5417 aese v0.16b, v25.16b 5418 aesmc v0.16b, v0.16b //AES block 0 - round 7 5419 5420 aese v1.16b, v25.16b 5421 aesmc v1.16b, v1.16b //AES block 1 - round 7 5422 5423 aese v3.16b, v25.16b 5424 aesmc v3.16b, v3.16b //AES block 3 - round 7 5425 5426 aese v0.16b, v26.16b 5427 aesmc v0.16b, v0.16b //AES block 0 - round 8 5428 5429 aese v2.16b, v25.16b 5430 aesmc v2.16b, v2.16b //AES block 2 - round 7 5431 5432 aese v3.16b, v26.16b 5433 aesmc v3.16b, v3.16b //AES block 3 - round 8 5434 5435 aese v1.16b, v26.16b 5436 aesmc v1.16b, v1.16b //AES block 1 - round 8 5437 5438 aese v0.16b, v27.16b 5439 aesmc v0.16b, v0.16b //AES block 0 - round 9 5440 5441 aese v2.16b, v26.16b 5442 aesmc v2.16b, v2.16b //AES block 2 - round 8 5443 ld1 {v31.4s}, [x8], #16 //load rk13 5444 5445 aese v1.16b, v27.16b 5446 aesmc v1.16b, v1.16b //AES block 1 - round 9 5447 5448 aese v0.16b, v28.16b 5449 aesmc v0.16b, v0.16b //AES block 0 - round 10 5450 5451 aese v3.16b, v27.16b 5452 aesmc v3.16b, v3.16b //AES block 3 
- round 9 5453 5454 aese v1.16b, v28.16b 5455 aesmc v1.16b, v1.16b //AES block 1 - round 10 5456 5457 aese v2.16b, v27.16b 5458 aesmc v2.16b, v2.16b //AES block 2 - round 9 5459 5460 aese v3.16b, v28.16b 5461 aesmc v3.16b, v3.16b //AES block 3 - round 10 5462 5463 aese v0.16b, v29.16b 5464 aesmc v0.16b, v0.16b //AES block 0 - round 11 5465 5466 aese v2.16b, v28.16b 5467 aesmc v2.16b, v2.16b //AES block 2 - round 10 5468 5469 aese v3.16b, v29.16b 5470 aesmc v3.16b, v3.16b //AES block 3 - round 11 5471 5472 aese v1.16b, v29.16b 5473 aesmc v1.16b, v1.16b //AES block 1 - round 11 5474 5475 aese v2.16b, v29.16b 5476 aesmc v2.16b, v2.16b //AES block 2 - round 11 5477 5478 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 5479 5480 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 5481 5482 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 5483 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 5484 5485 aese v1.16b, v30.16b 5486 aesmc v1.16b, v1.16b //AES block 1 - round 12 5487 5488 aese v0.16b, v30.16b 5489 aesmc v0.16b, v0.16b //AES block 0 - round 12 5490 5491 aese v2.16b, v30.16b 5492 aesmc v2.16b, v2.16b //AES block 2 - round 12 5493 5494 aese v3.16b, v30.16b 5495 aesmc v3.16b, v3.16b //AES block 3 - round 12 5496 eor v17.16b, v17.16b, v9.16b //h4k | h3k 5497 5498 aese v1.16b, v31.16b //AES block 1 - round 13 5499 5500 aese v2.16b, v31.16b //AES block 2 - round 13 5501 eor v16.16b, v16.16b, v8.16b //h2k | h1k 5502 5503 aese v3.16b, v31.16b //AES block 3 - round 13 5504 5505 aese v0.16b, v31.16b //AES block 0 - round 13 5506 b.ge .L256_dec_tail //handle tail 5507 5508 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 5509 5510 rev w9, w12 //CTR block 4 5511 5512 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 5513 5514 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 5515 rev64 v5.16b, v5.16b //GHASH block 1 5516 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 5517 5518 mov x7, v0.d[1] //AES block 0 - mov high 5519 5520 mov x6, v0.d[0] //AES block 0 - mov low 5521 rev64 v4.16b, 
v4.16b //GHASH block 0 5522 add w12, w12, #1 //CTR block 4 5523 5524 fmov d0, x10 //CTR block 4 5525 orr x9, x11, x9, lsl #32 //CTR block 4 5526 5527 fmov v0.d[1], x9 //CTR block 4 5528 rev w9, w12 //CTR block 5 5529 add w12, w12, #1 //CTR block 5 5530 5531 mov x19, v1.d[0] //AES block 1 - mov low 5532 5533 orr x9, x11, x9, lsl #32 //CTR block 5 5534 mov x20, v1.d[1] //AES block 1 - mov high 5535 eor x7, x7, x14 //AES block 0 - round 14 high 5536 #ifdef __AARCH64EB__ 5537 rev x7, x7 5538 #endif 5539 eor x6, x6, x13 //AES block 0 - round 14 low 5540 #ifdef __AARCH64EB__ 5541 rev x6, x6 5542 #endif 5543 stp x6, x7, [x2], #16 //AES block 0 - store result 5544 fmov d1, x10 //CTR block 5 5545 5546 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 5547 5548 fmov v1.d[1], x9 //CTR block 5 5549 rev w9, w12 //CTR block 6 5550 add w12, w12, #1 //CTR block 6 5551 5552 eor x19, x19, x13 //AES block 1 - round 14 low 5553 #ifdef __AARCH64EB__ 5554 rev x19, x19 5555 #endif 5556 orr x9, x11, x9, lsl #32 //CTR block 6 5557 5558 eor x20, x20, x14 //AES block 1 - round 14 high 5559 #ifdef __AARCH64EB__ 5560 rev x20, x20 5561 #endif 5562 stp x19, x20, [x2], #16 //AES block 1 - store result 5563 5564 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 5565 cmp x0, x5 //check if we have <= 8 blocks 5566 b.ge .L256_dec_prepretail //do prepretail 5567 5568 .L256_dec_main_loop: //main loop start 5569 mov x21, v2.d[0] //AES block 4k+2 - mov low 5570 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5571 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5572 5573 aese v0.16b, v18.16b 5574 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5575 mov x22, v2.d[1] //AES block 4k+2 - mov high 5576 5577 aese v1.16b, v18.16b 5578 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5579 fmov d2, x10 //CTR block 4k+6 5580 5581 fmov v2.d[1], x9 //CTR block 4k+6 5582 eor v4.16b, v4.16b, v11.16b //PRE 1 5583 rev w9, w12 //CTR block 4k+7 5584 5585 aese v0.16b, v19.16b 5586 aesmc v0.16b, v0.16b //AES block 4k+4 - 
round 1 5587 mov x24, v3.d[1] //AES block 4k+3 - mov high 5588 5589 aese v1.16b, v19.16b 5590 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5591 mov x23, v3.d[0] //AES block 4k+3 - mov low 5592 5593 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5594 mov d8, v4.d[1] //GHASH block 4k - mid 5595 fmov d3, x10 //CTR block 4k+7 5596 5597 aese v0.16b, v20.16b 5598 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5599 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5600 5601 aese v2.16b, v18.16b 5602 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5603 fmov v3.d[1], x9 //CTR block 4k+7 5604 5605 aese v1.16b, v20.16b 5606 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5607 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5608 5609 aese v0.16b, v21.16b 5610 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5611 eor x22, x22, x14 //AES block 4k+2 - round 14 high 5612 #ifdef __AARCH64EB__ 5613 rev x22, x22 5614 #endif 5615 aese v2.16b, v19.16b 5616 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5617 mov d10, v17.d[1] //GHASH block 4k - mid 5618 5619 aese v1.16b, v21.16b 5620 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5621 rev64 v6.16b, v6.16b //GHASH block 4k+2 5622 5623 aese v3.16b, v18.16b 5624 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5625 eor x21, x21, x13 //AES block 4k+2 - round 14 low 5626 #ifdef __AARCH64EB__ 5627 rev x21, x21 5628 #endif 5629 aese v2.16b, v20.16b 5630 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5631 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 5632 5633 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5634 5635 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5636 5637 aese v2.16b, v21.16b 5638 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5639 rev64 v7.16b, v7.16b //GHASH block 4k+3 5640 5641 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5642 eor x23, x23, x13 //AES block 4k+3 - round 14 low 5643 #ifdef __AARCH64EB__ 5644 rev x23, x23 5645 #endif 5646 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5647 eor 
x24, x24, x14 //AES block 4k+3 - round 14 high 5648 #ifdef __AARCH64EB__ 5649 rev x24, x24 5650 #endif 5651 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5652 5653 aese v2.16b, v22.16b 5654 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5655 5656 aese v3.16b, v19.16b 5657 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5658 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5659 5660 aese v0.16b, v22.16b 5661 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5662 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5663 5664 aese v2.16b, v23.16b 5665 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 5666 add w12, w12, #1 //CTR block 4k+7 5667 5668 aese v3.16b, v20.16b 5669 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5670 mov d8, v6.d[1] //GHASH block 4k+2 - mid 5671 5672 aese v1.16b, v22.16b 5673 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5674 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5675 5676 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5677 5678 aese v3.16b, v21.16b 5679 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5680 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5681 5682 aese v1.16b, v23.16b 5683 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 5684 5685 aese v0.16b, v23.16b 5686 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5687 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5688 5689 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5690 rev w9, w12 //CTR block 4k+8 5691 5692 aese v1.16b, v24.16b 5693 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 5694 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5695 5696 aese v0.16b, v24.16b 5697 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 5698 add w12, w12, #1 //CTR block 4k+8 5699 5700 aese v3.16b, v22.16b 5701 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5702 5703 aese v1.16b, v25.16b 5704 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 5705 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5706 5707 aese v0.16b, v25.16b 5708 aesmc v0.16b, v0.16b //AES block 4k+4 - round 
7 5709 5710 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5711 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5712 5713 aese v3.16b, v23.16b 5714 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5715 5716 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5717 5718 aese v0.16b, v26.16b 5719 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 5720 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5721 5722 aese v3.16b, v24.16b 5723 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 5724 5725 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5726 orr x9, x11, x9, lsl #32 //CTR block 4k+8 5727 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 5728 5729 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5730 5731 aese v0.16b, v27.16b 5732 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 5733 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 5734 5735 aese v1.16b, v26.16b 5736 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 5737 5738 aese v2.16b, v24.16b 5739 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 5740 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 5741 5742 aese v0.16b, v28.16b 5743 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 5744 5745 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 5746 movi v8.8b, #0xc2 5747 5748 aese v2.16b, v25.16b 5749 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 5750 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 5751 5752 aese v0.16b, v29.16b 5753 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 5754 5755 aese v3.16b, v25.16b 5756 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 5757 shl d8, d8, #56 //mod_constant 5758 5759 aese v2.16b, v26.16b 5760 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 5761 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 5762 5763 aese v0.16b, v30.16b 5764 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 5765 5766 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5767 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 5768 5769 aese v1.16b, v27.16b 5770 
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 5771 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 5772 5773 aese v0.16b, v31.16b //AES block 4k+4 - round 13 5774 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5775 5776 aese v1.16b, v28.16b 5777 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 5778 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 5779 5780 aese v2.16b, v27.16b 5781 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 5782 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 5783 5784 aese v3.16b, v26.16b 5785 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 5786 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 5787 5788 aese v1.16b, v29.16b 5789 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 5790 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 5791 5792 aese v2.16b, v28.16b 5793 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 5794 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5795 5796 aese v3.16b, v27.16b 5797 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 5798 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 5799 5800 aese v1.16b, v30.16b 5801 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 5802 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 5803 5804 aese v2.16b, v29.16b 5805 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 5806 mov x7, v0.d[1] //AES block 4k+4 - mov high 5807 5808 aese v3.16b, v28.16b 5809 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 5810 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5811 5812 aese v1.16b, v31.16b //AES block 4k+5 - round 13 5813 mov x6, v0.d[0] //AES block 4k+4 - mov low 5814 5815 aese v2.16b, v30.16b 5816 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 5817 fmov d0, x10 //CTR block 4k+8 5818 5819 aese v3.16b, v29.16b 5820 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 5821 fmov v0.d[1], x9 //CTR block 4k+8 5822 5823 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5824 eor v1.16b, v5.16b, v1.16b 
//AES block 4k+5 - result 5825 rev w9, w12 //CTR block 4k+9 5826 5827 aese v2.16b, v31.16b //AES block 4k+6 - round 13 5828 orr x9, x11, x9, lsl #32 //CTR block 4k+9 5829 cmp x0, x5 //.LOOP CONTROL 5830 5831 add w12, w12, #1 //CTR block 4k+9 5832 5833 eor x6, x6, x13 //AES block 4k+4 - round 14 low 5834 #ifdef __AARCH64EB__ 5835 rev x6, x6 5836 #endif 5837 eor x7, x7, x14 //AES block 4k+4 - round 14 high 5838 #ifdef __AARCH64EB__ 5839 rev x7, x7 5840 #endif 5841 mov x20, v1.d[1] //AES block 4k+5 - mov high 5842 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 5843 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 5844 5845 aese v3.16b, v30.16b 5846 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 5847 mov x19, v1.d[0] //AES block 4k+5 - mov low 5848 5849 fmov d1, x10 //CTR block 4k+9 5850 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5851 5852 fmov v1.d[1], x9 //CTR block 4k+9 5853 rev w9, w12 //CTR block 4k+10 5854 add w12, w12, #1 //CTR block 4k+10 5855 5856 aese v3.16b, v31.16b //AES block 4k+7 - round 13 5857 orr x9, x11, x9, lsl #32 //CTR block 4k+10 5858 5859 rev64 v5.16b, v5.16b //GHASH block 4k+5 5860 eor x20, x20, x14 //AES block 4k+5 - round 14 high 5861 #ifdef __AARCH64EB__ 5862 rev x20, x20 5863 #endif 5864 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 5865 5866 eor x19, x19, x13 //AES block 4k+5 - round 14 low 5867 #ifdef __AARCH64EB__ 5868 rev x19, x19 5869 #endif 5870 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 5871 5872 rev64 v4.16b, v4.16b //GHASH block 4k+4 5873 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5874 b.lt .L256_dec_main_loop 5875 5876 5877 .L256_dec_prepretail: //PREPRETAIL 5878 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5879 mov x21, v2.d[0] //AES block 4k+2 - mov low 5880 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5881 5882 aese v0.16b, v18.16b 5883 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5884 mov x22, v2.d[1] //AES block 4k+2 - mov high 5885 5886 aese v1.16b, 
v18.16b 5887 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5888 fmov d2, x10 //CTR block 4k+6 5889 5890 fmov v2.d[1], x9 //CTR block 4k+6 5891 rev w9, w12 //CTR block 4k+7 5892 eor v4.16b, v4.16b, v11.16b //PRE 1 5893 5894 rev64 v6.16b, v6.16b //GHASH block 4k+2 5895 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5896 mov x23, v3.d[0] //AES block 4k+3 - mov low 5897 5898 aese v1.16b, v19.16b 5899 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5900 mov x24, v3.d[1] //AES block 4k+3 - mov high 5901 5902 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5903 mov d8, v4.d[1] //GHASH block 4k - mid 5904 fmov d3, x10 //CTR block 4k+7 5905 5906 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5907 fmov v3.d[1], x9 //CTR block 4k+7 5908 5909 aese v2.16b, v18.16b 5910 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5911 mov d10, v17.d[1] //GHASH block 4k - mid 5912 5913 aese v0.16b, v19.16b 5914 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5915 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5916 5917 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5918 5919 aese v2.16b, v19.16b 5920 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5921 rev64 v7.16b, v7.16b //GHASH block 4k+3 5922 5923 aese v3.16b, v18.16b 5924 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5925 5926 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5927 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5928 5929 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5930 5931 aese v3.16b, v19.16b 5932 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5933 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5934 5935 aese v0.16b, v20.16b 5936 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5937 5938 aese v1.16b, v20.16b 5939 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5940 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5941 5942 aese v2.16b, v20.16b 5943 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5944 5945 aese v0.16b, v21.16b 5946 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5947 mov d8, 
v6.d[1] //GHASH block 4k+2 - mid 5948 5949 aese v3.16b, v20.16b 5950 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5951 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5952 5953 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5954 5955 aese v0.16b, v22.16b 5956 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5957 5958 aese v3.16b, v21.16b 5959 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5960 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5961 5962 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5963 5964 aese v0.16b, v23.16b 5965 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5966 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5967 5968 aese v3.16b, v22.16b 5969 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5970 5971 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5972 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5973 5974 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5975 5976 aese v3.16b, v23.16b 5977 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5978 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5979 5980 aese v2.16b, v21.16b 5981 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5982 5983 aese v1.16b, v21.16b 5984 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5985 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5986 5987 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5988 5989 aese v2.16b, v22.16b 5990 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5991 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5992 5993 aese v1.16b, v22.16b 5994 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5995 5996 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5997 5998 aese v2.16b, v23.16b 5999 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 6000 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 6001 6002 aese v1.16b, v23.16b 6003 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 6004 6005 aese v3.16b, v24.16b 6006 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 6007 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 
6008 6009 aese v2.16b, v24.16b 6010 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 6011 6012 aese v0.16b, v24.16b 6013 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 6014 movi v8.8b, #0xc2 6015 6016 aese v1.16b, v24.16b 6017 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 6018 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 6019 6020 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 6021 6022 aese v3.16b, v25.16b 6023 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 6024 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 6025 6026 aese v1.16b, v25.16b 6027 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 6028 6029 aese v0.16b, v25.16b 6030 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 6031 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 6032 6033 aese v3.16b, v26.16b 6034 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 6035 6036 aese v2.16b, v25.16b 6037 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 6038 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6039 6040 aese v1.16b, v26.16b 6041 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 6042 6043 aese v0.16b, v26.16b 6044 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 6045 shl d8, d8, #56 //mod_constant 6046 6047 aese v2.16b, v26.16b 6048 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 6049 6050 aese v1.16b, v27.16b 6051 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 6052 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6053 6054 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6055 6056 aese v2.16b, v27.16b 6057 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 6058 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6059 6060 aese v3.16b, v27.16b 6061 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 6062 6063 aese v0.16b, v27.16b 6064 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 6065 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6066 6067 aese v2.16b, v28.16b 6068 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 6069 6070 aese v3.16b, v28.16b 6071 
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 6072 6073 aese v0.16b, v28.16b 6074 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 6075 eor x22, x22, x14 //AES block 4k+2 - round 14 high 6076 #ifdef __AARCH64EB__ 6077 rev x22, x22 6078 #endif 6079 aese v1.16b, v28.16b 6080 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 6081 eor x23, x23, x13 //AES block 4k+3 - round 14 low 6082 #ifdef __AARCH64EB__ 6083 rev x23, x23 6084 #endif 6085 aese v2.16b, v29.16b 6086 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 6087 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6088 6089 aese v0.16b, v29.16b 6090 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 6091 add w12, w12, #1 //CTR block 4k+7 6092 6093 aese v1.16b, v29.16b 6094 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 6095 eor x21, x21, x13 //AES block 4k+2 - round 14 low 6096 #ifdef __AARCH64EB__ 6097 rev x21, x21 6098 #endif 6099 6100 aese v2.16b, v30.16b 6101 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 6102 6103 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6104 eor x24, x24, x14 //AES block 4k+3 - round 14 high 6105 #ifdef __AARCH64EB__ 6106 rev x24, x24 6107 #endif 6108 6109 aese v3.16b, v29.16b 6110 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 6111 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 6112 6113 aese v1.16b, v30.16b 6114 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 6115 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6116 6117 aese v0.16b, v30.16b 6118 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 6119 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 6120 6121 aese v3.16b, v30.16b 6122 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 6123 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6124 6125 aese v1.16b, v31.16b //AES block 4k+5 - round 13 6126 6127 aese v0.16b, v31.16b //AES block 4k+4 - round 13 6128 6129 aese v3.16b, v31.16b //AES block 4k+7 - round 13 6130 6131 aese v2.16b, v31.16b //AES block 4k+6 - round 13 6132 eor 
v11.16b, v11.16b, v10.16b //MODULO - fold into low 6133 .L256_dec_tail: //TAIL 6134 6135 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 6136 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 6137 6138 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 6139 6140 mov x6, v0.d[0] //AES block 4k+4 - mov low 6141 6142 mov x7, v0.d[1] //AES block 4k+4 - mov high 6143 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 6144 6145 cmp x5, #48 6146 6147 eor x6, x6, x13 //AES block 4k+4 - round 14 low 6148 #ifdef __AARCH64EB__ 6149 rev x6, x6 6150 #endif 6151 6152 eor x7, x7, x14 //AES block 4k+4 - round 14 high 6153 #ifdef __AARCH64EB__ 6154 rev x7, x7 6155 #endif 6156 b.gt .L256_dec_blocks_more_than_3 6157 6158 sub w12, w12, #1 6159 mov v3.16b, v2.16b 6160 movi v10.8b, #0 6161 6162 movi v11.8b, #0 6163 cmp x5, #32 6164 6165 movi v9.8b, #0 6166 mov v2.16b, v1.16b 6167 b.gt .L256_dec_blocks_more_than_2 6168 6169 sub w12, w12, #1 6170 6171 mov v3.16b, v1.16b 6172 cmp x5, #16 6173 b.gt .L256_dec_blocks_more_than_1 6174 6175 sub w12, w12, #1 6176 b .L256_dec_blocks_less_than_1 6177 .L256_dec_blocks_more_than_3: //blocks left > 3 6178 rev64 v4.16b, v5.16b //GHASH final-3 block 6179 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 6180 6181 stp x6, x7, [x2], #16 //AES final-3 block - store result 6182 6183 mov d10, v17.d[1] //GHASH final-3 block - mid 6184 6185 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6186 6187 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 6188 6189 mov d22, v4.d[1] //GHASH final-3 block - mid 6190 6191 mov x6, v0.d[0] //AES final-2 block - mov low 6192 6193 mov x7, v0.d[1] //AES final-2 block - mov high 6194 6195 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 6196 6197 movi v8.8b, #0 //suppress further partial tag feed in 6198 6199 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 6200 6201 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 6202 eor x6, x6, x13 //AES 
final-2 block - round 14 low 6203 #ifdef __AARCH64EB__ 6204 rev x6, x6 6205 #endif 6206 6207 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 6208 eor x7, x7, x14 //AES final-2 block - round 14 high 6209 #ifdef __AARCH64EB__ 6210 rev x7, x7 6211 #endif 6212 .L256_dec_blocks_more_than_2: //blocks left > 2 6213 6214 rev64 v4.16b, v5.16b //GHASH final-2 block 6215 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 6216 6217 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6218 stp x6, x7, [x2], #16 //AES final-2 block - store result 6219 6220 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 6221 6222 mov d22, v4.d[1] //GHASH final-2 block - mid 6223 6224 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 6225 6226 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 6227 6228 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 6229 mov x6, v0.d[0] //AES final-1 block - mov low 6230 6231 mov x7, v0.d[1] //AES final-1 block - mov high 6232 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 6233 movi v8.8b, #0 //suppress further partial tag feed in 6234 6235 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 6236 6237 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 6238 eor x6, x6, x13 //AES final-1 block - round 14 low 6239 #ifdef __AARCH64EB__ 6240 rev x6, x6 6241 #endif 6242 6243 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 6244 eor x7, x7, x14 //AES final-1 block - round 14 high 6245 #ifdef __AARCH64EB__ 6246 rev x7, x7 6247 #endif 6248 .L256_dec_blocks_more_than_1: //blocks left > 1 6249 6250 stp x6, x7, [x2], #16 //AES final-1 block - store result 6251 rev64 v4.16b, v5.16b //GHASH final-1 block 6252 6253 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 6254 6255 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6256 movi v8.8b, #0 //suppress further partial tag feed in 6257 6258 mov d22, v4.d[1] //GHASH final-1 block - mid 6259 6260 eor v0.16b, v5.16b, v3.16b //AES final block - 
result 6261 6262 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 6263 6264 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 6265 6266 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 6267 mov x6, v0.d[0] //AES final block - mov low 6268 6269 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 6270 6271 mov x7, v0.d[1] //AES final block - mov high 6272 6273 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 6274 eor x6, x6, x13 //AES final block - round 14 low 6275 #ifdef __AARCH64EB__ 6276 rev x6, x6 6277 #endif 6278 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 6279 6280 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 6281 6282 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 6283 eor x7, x7, x14 //AES final block - round 14 high 6284 #ifdef __AARCH64EB__ 6285 rev x7, x7 6286 #endif 6287 .L256_dec_blocks_less_than_1: //blocks left <= 1 6288 6289 and x1, x1, #127 //bit_length %= 128 6290 mvn x14, xzr //rk14_h = 0xffffffffffffffff 6291 6292 sub x1, x1, #128 //bit_length -= 128 6293 mvn x13, xzr //rk14_l = 0xffffffffffffffff 6294 6295 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 6296 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 6297 6298 and x1, x1, #127 //bit_length %= 128 6299 6300 lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 6301 cmp x1, #64 6302 6303 csel x9, x13, x14, lt 6304 csel x10, x14, xzr, lt 6305 6306 fmov d0, x9 //ctr0b is mask for last block 6307 and x6, x6, x9 6308 6309 mov v0.d[1], x10 6310 bic x4, x4, x9 //mask out low existing bytes 6311 6312 #ifndef __AARCH64EB__ 6313 rev w9, w12 6314 #else 6315 mov w9, w12 6316 #endif 6317 6318 bic x5, x5, x10 //mask out high existing bytes 6319 6320 orr x6, x6, x4 6321 6322 and x7, x7, x10 6323 6324 orr x7, x7, x5 6325 6326 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 6327 6328 rev64 v4.16b, v5.16b //GHASH final block 6329 6330 eor v4.16b, v4.16b, v8.16b //feed in 
partial tag 6331 6332 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 6333 6334 mov d8, v4.d[1] //GHASH final block - mid 6335 6336 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 6337 6338 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 6339 6340 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 6341 6342 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 6343 6344 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 6345 6346 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 6347 movi v8.8b, #0xc2 6348 6349 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6350 6351 shl d8, d8, #56 //mod_constant 6352 6353 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6354 6355 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6356 6357 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6358 6359 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6360 6361 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6362 6363 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6364 6365 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6366 6367 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6368 6369 stp x6, x7, [x2] 6370 6371 str w9, [x16, #12] //store the updated counter 6372 6373 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 6374 ext v11.16b, v11.16b, v11.16b, #8 6375 rev64 v11.16b, v11.16b 6376 mov x0, x15 6377 st1 { v11.16b }, [x3] 6378 6379 ldp x21, x22, [sp, #16] 6380 ldp x23, x24, [sp, #32] 6381 ldp d8, d9, [sp, #48] 6382 ldp d10, d11, [sp, #64] 6383 ldp d12, d13, [sp, #80] 6384 ldp d14, d15, [sp, #96] 6385 ldp x19, x20, [sp], #112 6386 ret 6387 6388 .L256_dec_ret: 6389 mov w0, #0x0 6390 ret 6391 .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6392 .section .rodata 6393 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6394 .align 2 
6395 .align 2 6396 #endif 6397