1 1.1 christos #include "arm_arch.h" 2 1.1 christos 3 1.1 christos #if __ARM_MAX_ARCH__>=8 4 1.1 christos .arch armv8-a+crypto 5 1.1 christos .text 6 1.1 christos .globl aes_gcm_enc_128_kernel 7 1.1 christos .type aes_gcm_enc_128_kernel,%function 8 1.1 christos .align 4 9 1.1 christos aes_gcm_enc_128_kernel: 10 1.2 christos AARCH64_VALID_CALL_TARGET 11 1.1 christos cbz x1, .L128_enc_ret 12 1.1 christos stp x19, x20, [sp, #-112]! 13 1.1 christos mov x16, x4 14 1.1 christos mov x8, x5 15 1.1 christos stp x21, x22, [sp, #16] 16 1.1 christos stp x23, x24, [sp, #32] 17 1.1 christos stp d8, d9, [sp, #48] 18 1.1 christos stp d10, d11, [sp, #64] 19 1.1 christos stp d12, d13, [sp, #80] 20 1.1 christos stp d14, d15, [sp, #96] 21 1.1 christos 22 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 23 1.1 christos #ifdef __AARCH64EB__ 24 1.1 christos rev x10, x10 25 1.1 christos rev x11, x11 26 1.1 christos #endif 27 1.1 christos ldp x13, x14, [x8, #160] //load rk10 28 1.1 christos #ifdef __AARCH64EB__ 29 1.1 christos ror x13, x13, #32 30 1.1 christos ror x14, x14, #32 31 1.1 christos #endif 32 1.1 christos ld1 {v11.16b}, [x3] 33 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 34 1.1 christos rev64 v11.16b, v11.16b 35 1.1 christos lsr x5, x1, #3 //byte_len 36 1.1 christos mov x15, x5 37 1.1 christos 38 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 39 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 40 1.1 christos sub x5, x5, #1 //byte_len - 1 41 1.1 christos 42 1.1 christos lsr x12, x11, #32 43 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 44 1.1 christos #ifndef __AARCH64EB__ 45 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 46 1.1 christos #endif 47 1.1 christos fmov d1, x10 //CTR block 1 48 1.1 christos rev w12, w12 //rev_ctr32 49 1.1 christos 50 1.1 christos add w12, w12, #1 //increment rev_ctr32 51 1.1 christos orr w11, w11, w11 52 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 53 1.1 christos 54 1.1 christos rev w9, w12 //CTR block 1 55 1.1 christos add w12, w12, #1 //CTR block 1 56 1.1 christos fmov d3, x10 //CTR block 3 57 1.1 christos 58 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 59 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 60 1.1 christos 61 1.1 christos fmov v1.d[1], x9 //CTR block 1 62 1.1 christos rev w9, w12 //CTR block 2 63 1.1 christos 64 1.1 christos fmov d2, x10 //CTR block 2 65 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 66 1.1 christos add w12, w12, #1 //CTR block 2 67 1.1 christos 68 1.1 christos fmov v2.d[1], x9 //CTR block 2 69 1.1 christos rev w9, w12 //CTR block 3 70 1.1 christos 71 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 72 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 73 1.1 christos 74 1.1 christos add w12, w12, #1 //CTR block 3 75 1.1 christos fmov v3.d[1], x9 //CTR block 3 76 1.1 christos 77 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 78 1.1 christos #ifndef __AARCH64EB__ 79 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 80 1.1 christos #endif 81 1.1 christos aese v1.16b, v18.16b 82 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 83 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 84 1.1 christos 85 1.1 christos aese v2.16b, v18.16b 86 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 87 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 88 1.1 christos #ifndef __AARCH64EB__ 89 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 90 1.1 christos #endif 91 1.1 christos 92 1.1 christos aese v0.16b, v18.16b 93 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 94 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 95 1.1 christos 96 1.1 christos aese v3.16b, v18.16b 97 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 98 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 99 1.1 christos 100 1.1 christos aese v2.16b, v19.16b 101 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 102 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 103 1.1 christos 104 1.1 christos aese v0.16b, v19.16b 105 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 106 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 107 1.1 christos 108 1.1 christos aese v1.16b, v19.16b 109 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 110 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 111 1.1 christos 112 1.1 christos aese v3.16b, v19.16b 113 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 114 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 115 1.1 christos 116 1.1 christos aese v0.16b, v20.16b 117 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 118 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 119 1.1 christos 120 1.1 christos aese v1.16b, v20.16b 121 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 122 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 123 1.1 christos #ifndef __AARCH64EB__ 124 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 125 1.1 christos #endif 126 1.1 christos 127 1.1 christos aese v3.16b, v20.16b 128 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 129 1.1 christos 130 1.1 christos aese v2.16b, v20.16b 131 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 132 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 133 1.1 christos 134 1.1 christos aese v0.16b, v21.16b 135 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 136 1.1 christos 137 1.1 christos aese v1.16b, v21.16b 138 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 139 1.1 christos 140 1.1 christos aese v2.16b, v21.16b 141 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 142 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 143 1.1 christos 144 1.1 christos aese v3.16b, v21.16b 145 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 146 1.1 christos 147 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 148 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 149 1.1 christos 150 1.1 christos aese v3.16b, v22.16b 151 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 152 1.1 christos add x5, x5, x0 153 1.1 christos 154 1.1 christos aese v2.16b, v22.16b 155 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 156 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 157 1.1 christos 158 1.1 christos aese v0.16b, v22.16b 159 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 160 1.1 christos 161 1.1 christos aese v3.16b, v23.16b 162 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 163 1.1 christos 164 1.1 christos aese v2.16b, v23.16b 165 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 166 1.1 christos 167 1.1 christos aese v0.16b, v23.16b 168 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 169 1.1 christos 170 1.1 christos aese v3.16b, v24.16b 171 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 172 1.1 christos 173 1.1 christos aese v1.16b, v22.16b 174 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 175 1.1 christos 176 1.1 christos aese v2.16b, v24.16b 177 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 178 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 179 1.1 christos 180 1.1 christos aese v0.16b, v24.16b 181 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 182 1.1 christos 183 1.1 christos aese v1.16b, v23.16b 184 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 185 1.1 christos 186 1.1 christos aese v3.16b, v25.16b 187 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 188 1.1 christos 189 1.1 christos aese v0.16b, v25.16b 190 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 191 1.1 christos 192 1.1 christos aese v1.16b, v24.16b 193 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 194 1.1 christos 195 1.1 christos aese v2.16b, v25.16b 196 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 197 1.1 christos 198 1.1 christos aese v0.16b, v26.16b 199 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 200 1.1 christos 201 1.1 christos aese v1.16b, v25.16b 202 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 203 1.1 christos 204 1.1 christos aese v2.16b, v26.16b 205 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 206 1.1 christos 207 1.1 christos aese v3.16b, v26.16b 208 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 209 1.1 christos 210 1.1 christos aese v1.16b, v26.16b 211 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 212 1.1 christos 213 1.1 christos aese v2.16b, v27.16b //AES block 2 - round 9 214 1.1 christos 215 1.1 christos aese v0.16b, v27.16b //AES block 0 - round 9 216 1.1 christos 217 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 218 1.1 christos 219 1.1 christos aese v1.16b, v27.16b //AES block 1 - round 9 220 1.1 christos 221 1.1 christos aese v3.16b, v27.16b //AES block 3 - round 9 222 1.1 christos b.ge .L128_enc_tail //handle tail 223 1.1 christos 224 1.1 christos ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 225 1.1 christos #ifdef __AARCH64EB__ 226 1.1 christos rev x6, x6 227 1.1 christos rev x7, x7 228 1.1 christos #endif 229 1.1 christos ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 230 1.1 christos #ifdef __AARCH64EB__ 231 1.1 christos rev x21, x21 232 1.1 christos rev x22, x22 233 1.1 christos #endif 234 1.1 christos ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 235 1.1 christos #ifdef __AARCH64EB__ 236 1.1 christos rev x19, x19 237 1.1 christos rev x20, x20 238 1.1 christos #endif 239 1.1 christos ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 240 1.1 christos #ifdef __AARCH64EB__ 241 1.1 christos rev x23, x23 242 1.1 christos rev x24, x24 243 1.1 christos #endif 244 1.1 christos eor x6, x6, x13 //AES block 0 - round 10 low 245 1.1 christos eor x7, x7, x14 //AES block 0 - round 10 high 246 1.1 christos 247 1.1 christos eor x21, x21, x13 //AES block 2 - round 10 low 248 1.1 christos fmov d4, x6 //AES block 0 - mov low 249 1.1 christos 250 1.1 christos eor x19, x19, x13 //AES block 1 - round 10 low 251 1.1 christos eor x22, x22, x14 //AES block 2 - round 10 high 252 1.1 christos fmov v4.d[1], x7 //AES block 0 - mov high 253 1.1 christos 254 1.1 christos fmov d5, x19 //AES block 1 - mov low 255 1.1 christos eor x20, x20, x14 //AES block 1 - round 10 high 256 1.1 christos 257 1.1 christos eor x23, x23, x13 //AES block 3 - round 10 low 258 1.1 christos fmov v5.d[1], x20 //AES block 1 - mov high 259 1.1 christos 260 1.1 christos fmov d6, x21 //AES block 2 - mov low 261 1.1 christos eor x24, x24, x14 //AES block 3 - round 10 high 262 1.1 christos rev w9, w12 //CTR block 4 263 1.1 christos 264 1.1 christos fmov v6.d[1], x22 //AES block 2 - mov high 265 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 266 1.1 christos 267 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 0 - result 268 1.1 christos fmov d0, x10 //CTR block 4 269 1.1 christos add w12, w12, #1 //CTR block 4 270 1.1 christos 271 1.1 christos fmov v0.d[1], x9 //CTR block 4 272 1.1 christos rev w9, w12 //CTR block 5 273 1.1 christos 274 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 1 - result 275 1.1 christos fmov d1, x10 //CTR block 5 276 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 277 1.1 christos 278 1.1 christos add w12, w12, #1 //CTR block 5 279 1.1 christos add x0, x0, #64 //AES input_ptr update 280 1.1 christos fmov v1.d[1], x9 //CTR block 5 281 1.1 christos 282 1.1 christos fmov d7, x23 //AES block 3 - mov low 283 1.1 christos rev w9, w12 //CTR block 6 284 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 0 - store result 285 1.1 christos 286 1.1 christos fmov v7.d[1], x24 //AES block 3 - mov high 287 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 288 1.1 christos 289 1.1 christos add w12, w12, #1 //CTR block 6 290 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 2 - result 291 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 1 - store result 292 1.1 christos 293 1.1 christos fmov d2, x10 //CTR block 6 294 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 295 1.1 christos 296 1.1 christos fmov v2.d[1], x9 //CTR block 6 297 1.1 christos rev w9, w12 //CTR block 7 298 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 2 - store result 299 1.1 christos 300 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 7 301 1.1 christos 302 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 3 - result 303 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 3 - store result 304 1.1 christos b.ge .L128_enc_prepretail //do prepretail 305 1.1 christos 306 1.1 christos .L128_enc_main_loop: //main loop start 307 1.1 christos ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext 308 1.1 christos #ifdef __AARCH64EB__ 309 1.1 christos rev x23, x23 310 1.1 christos rev x24, x24 311 1.1 christos #endif 312 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 313 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 314 1.1 christos 315 1.1 christos aese v2.16b, v18.16b 316 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 317 1.1 christos fmov d3, x10 //CTR block 4k+3 318 1.1 christos 319 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 320 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 321 1.1 christos 322 1.1 christos aese v1.16b, v18.16b 323 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 324 1.1 christos add w12, w12, #1 //CTR block 4k+3 325 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 326 1.1 christos 327 1.1 christos aese v0.16b, v18.16b 328 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 329 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 330 1.1 christos 331 1.1 christos aese v2.16b, v19.16b 332 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 333 1.1 christos mov d30, v5.d[1] //GHASH block 4k+1 - mid 334 1.1 christos 335 1.1 christos aese v1.16b, v19.16b 336 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 337 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 338 1.1 christos 339 1.1 christos aese v3.16b, v18.16b 340 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 341 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 10 high 342 1.1 christos 343 1.1 christos pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 344 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 345 1.1 christos ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 346 1.1 christos #ifdef __AARCH64EB__ 347 1.1 christos rev x6, x6 348 1.1 christos rev x7, x7 349 1.1 christos #endif 350 1.1 christos aese v0.16b, v19.16b 351 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 352 1.1 christos rev w9, w12 //CTR block 4k+8 353 1.1 christos 354 1.1 christos eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 355 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 356 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 357 1.1 christos 358 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 359 1.1 christos add w12, w12, #1 //CTR block 4k+8 360 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 361 1.1 christos 362 1.1 christos aese v0.16b, v20.16b 363 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 364 1.1 christos 365 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 366 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 367 1.1 christos 368 1.1 christos aese v1.16b, v20.16b 369 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 370 1.1 christos 371 1.1 christos aese v0.16b, v21.16b 372 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 373 1.1 christos eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 374 1.1 christos 375 1.1 christos pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 376 1.1 christos 377 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 378 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 379 1.1 christos 380 1.1 christos pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 381 1.1 christos 382 1.1 christos pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 383 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 384 1.1 christos 385 1.1 christos pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 386 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 10 high 387 1.1 christos 388 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 389 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 390 1.1 christos 391 1.1 christos aese v3.16b, v19.16b 392 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 393 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 394 1.1 christos 395 1.1 christos aese v2.16b, v20.16b 396 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 397 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 10 low 398 1.1 christos 399 1.1 christos aese v1.16b, v21.16b 400 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 401 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 402 1.1 christos 403 1.1 christos pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 404 1.1 christos 405 1.1 christos aese v2.16b, v21.16b 406 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 407 1.1 christos eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 408 1.1 christos 409 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 410 1.1 christos 411 1.1 christos pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 412 1.1 christos movi v8.8b, #0xc2 413 1.1 christos 414 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 415 1.1 christos eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 416 1.1 christos 417 1.1 christos aese v1.16b, v22.16b 418 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 419 1.1 christos 420 1.1 christos aese v3.16b, v20.16b 421 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 422 1.1 christos shl d8, d8, #56 //mod_constant 423 1.1 christos 424 1.1 christos aese v0.16b, v22.16b 425 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 426 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 427 1.1 christos 428 1.1 christos aese v1.16b, v23.16b 429 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 430 1.1 christos ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext 431 1.1 christos #ifdef __AARCH64EB__ 432 1.1 christos rev x19, x19 433 1.1 christos rev x20, x20 434 1.1 christos #endif 435 1.1 christos aese v3.16b, v21.16b 436 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 437 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 438 1.1 christos 439 1.1 christos aese v0.16b, v23.16b 440 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 441 1.1 christos ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 442 1.1 christos #ifdef __AARCH64EB__ 443 1.1 christos rev x21, x21 444 1.1 christos rev x22, x22 445 1.1 christos #endif 446 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 447 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 448 1.1 christos 449 1.1 christos aese v2.16b, v22.16b 450 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 451 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 10 low 452 1.1 christos 453 1.1 christos aese v3.16b, v22.16b 454 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 455 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 456 1.1 christos 457 1.1 christos aese v1.16b, v24.16b 458 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 459 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 10 low 460 1.1 christos 461 1.1 christos aese v2.16b, v23.16b 462 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 463 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 464 1.1 christos 465 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 466 1.1 christos aese v0.16b, v24.16b 467 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 468 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 469 1.1 christos 470 1.1 christos add x0, x0, #64 //AES input_ptr update 471 1.1 christos fmov d7, x23 //AES block 4k+3 - mov low 472 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 473 1.1 christos 474 1.1 christos aese v3.16b, v23.16b 475 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 476 1.1 christos fmov d5, x19 //AES block 4k+5 - mov low 477 1.1 christos 478 1.1 christos aese v0.16b, v25.16b 479 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 480 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 481 1.1 christos 482 1.1 christos aese v2.16b, v24.16b 483 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 484 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 10 high 485 1.1 christos 486 1.1 christos aese v1.16b, v25.16b 487 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 488 1.1 christos fmov v5.d[1], x20 //AES block 4k+5 - mov high 489 1.1 christos 490 1.1 christos aese v0.16b, v26.16b 491 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 492 1.1 christos fmov v7.d[1], x24 //AES block 4k+3 - mov high 493 1.1 christos 494 1.1 christos aese v3.16b, v24.16b 495 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 496 1.1 christos cmp x0, x5 //.LOOP CONTROL 497 1.1 christos 498 1.1 christos aese v1.16b, v26.16b 499 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 500 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 501 1.1 christos 502 1.1 christos aese v0.16b, v27.16b //AES block 4k+4 - round 9 503 1.1 christos eor x21, x21, x13 //AES block 4k+6 - round 10 low 504 1.1 christos eor x22, x22, x14 //AES block 4k+6 - round 10 high 505 1.1 christos 506 1.1 christos aese v3.16b, v25.16b 507 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 508 1.1 christos fmov d6, x21 //AES block 4k+6 - mov low 509 1.1 christos 510 1.1 christos aese v1.16b, v27.16b //AES block 4k+5 - round 9 511 1.1 christos fmov v6.d[1], x22 //AES block 4k+6 - mov high 512 1.1 christos 513 1.1 christos aese v2.16b, v25.16b 514 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 515 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 516 1.1 christos 517 1.1 christos fmov d0, x10 //CTR block 4k+8 518 1.1 christos aese v3.16b, v26.16b 519 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 520 1.1 christos 521 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 522 1.1 christos rev w9, w12 //CTR block 4k+9 523 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 524 1.1 christos 525 1.1 christos aese v2.16b, v26.16b 526 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 527 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 528 1.1 christos 529 1.1 christos add w12, w12, #1 //CTR block 4k+9 530 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 531 1.1 christos fmov d1, x10 //CTR block 4k+9 532 1.1 christos 533 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 534 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 535 1.1 christos rev w9, w12 //CTR block 4k+10 536 1.1 christos 537 1.1 christos aese v2.16b, v27.16b //AES block 4k+6 - round 9 538 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result 539 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 540 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 541 1.1 christos 542 1.1 christos aese v3.16b, v27.16b //AES block 4k+7 - round 9 543 1.1 christos add w12, w12, #1 //CTR block 4k+10 544 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 545 1.1 christos fmov d2, x10 //CTR block 4k+10 546 1.1 christos 547 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 548 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 549 1.1 christos 550 1.1 christos fmov v2.d[1], x9 //CTR block 4k+10 551 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result 552 1.1 christos rev w9, w12 //CTR block 4k+11 553 1.1 christos 554 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+11 555 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result 556 1.1 christos 557 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 558 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result 559 1.1 christos b.lt .L128_enc_main_loop 560 1.1 christos 561 1.1 christos .L128_enc_prepretail: //PREPRETAIL 562 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 563 1.1 christos fmov d3, x10 //CTR block 4k+3 564 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 565 1.1 christos 566 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 567 1.1 christos add w12, w12, #1 //CTR block 4k+3 568 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 569 1.1 christos 570 1.1 christos aese v1.16b, v18.16b 571 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 572 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 573 1.1 christos 574 1.1 christos pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 575 1.1 christos 576 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 577 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 578 1.1 christos 579 1.1 christos pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 580 1.1 christos 581 1.1 christos aese v3.16b, v18.16b 582 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 583 1.1 christos mov d30, v5.d[1] //GHASH block 4k+1 - mid 584 1.1 christos 585 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 586 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 587 1.1 christos 588 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 589 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 590 1.1 christos 591 1.1 christos aese v1.16b, v19.16b 592 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 593 1.1 christos eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 594 1.1 christos 595 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 596 1.1 christos 597 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 598 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 599 1.1 christos 600 1.1 christos aese v3.16b, v19.16b 601 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 602 1.1 christos 603 1.1 christos pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 604 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 605 1.1 christos 606 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 607 1.1 christos 608 1.1 christos aese v0.16b, v18.16b 609 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 610 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 611 1.1 christos 612 1.1 christos aese v2.16b, v18.16b 613 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 614 1.1 christos 615 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 616 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 617 1.1 christos 618 1.1 christos aese v0.16b, v19.16b 619 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 620 1.1 christos eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 621 1.1 christos 622 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 623 1.1 christos 624 1.1 christos pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 625 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 626 1.1 christos 627 1.1 christos pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 628 1.1 christos 629 1.1 christos pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 630 1.1 christos 631 1.1 christos aese v2.16b, v19.16b 632 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 633 1.1 christos eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 634 1.1 christos 635 1.1 christos aese v0.16b, v20.16b 636 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 637 1.1 christos 638 1.1 christos pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 639 1.1 christos movi v8.8b, #0xc2 640 1.1 christos 641 1.1 christos aese v2.16b, v20.16b 642 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 643 1.1 christos eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 644 1.1 christos 645 1.1 christos aese v3.16b, v20.16b 646 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 647 1.1 christos 648 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 649 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 650 1.1 christos 651 1.1 christos aese v2.16b, v21.16b 652 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 653 1.1 christos 654 1.1 christos aese v1.16b, v20.16b 655 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 656 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 657 1.1 christos 658 1.1 christos aese v0.16b, v21.16b 659 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 660 1.1 christos 661 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 662 1.1 christos shl d8, d8, #56 //mod_constant 663 1.1 christos 664 1.1 christos aese v1.16b, v21.16b 665 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 666 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 667 1.1 christos 668 1.1 christos aese v0.16b, v22.16b 669 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 670 1.1 christos 671 1.1 christos pmull v28.1q, v9.1d, v8.1d 672 1.1 christos eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 673 1.1 christos 674 1.1 christos aese v1.16b, v22.16b 675 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 676 1.1 christos 677 1.1 christos aese v0.16b, v23.16b 678 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 679 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 680 1.1 christos 681 1.1 christos aese v3.16b, v21.16b 682 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 683 1.1 christos 684 1.1 christos aese v2.16b, v22.16b 685 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 686 1.1 christos eor v10.16b, v10.16b, v11.16b 687 1.1 christos 688 1.1 christos aese v0.16b, v24.16b 689 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 690 1.1 christos 691 1.1 christos aese v3.16b, v22.16b 692 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 693 1.1 christos 694 1.1 christos aese v1.16b, v23.16b 695 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 696 1.1 christos 697 1.1 christos aese v2.16b, v23.16b 698 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 699 1.1 christos eor v10.16b, v10.16b, v28.16b 700 1.1 christos 701 1.1 christos aese v3.16b, v23.16b 702 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 703 1.1 christos 704 1.1 christos aese v1.16b, v24.16b 705 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 706 1.1 christos 707 1.1 christos aese v2.16b, v24.16b 708 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 709 1.1 christos 710 1.1 christos aese v3.16b, v24.16b 711 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 712 1.1 christos eor v10.16b, v10.16b, v9.16b 713 1.1 christos 714 1.1 christos aese v0.16b, v25.16b 715 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 716 1.1 christos 717 1.1 christos aese v2.16b, v25.16b 718 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 719 1.1 christos 720 1.1 christos aese v3.16b, v25.16b 721 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 722 1.1 christos 723 1.1 christos pmull v28.1q, v10.1d, v8.1d 724 1.1 christos 725 1.1 christos aese v1.16b, v25.16b 726 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 727 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 728 1.1 christos 729 1.1 christos aese v3.16b, v26.16b 730 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 731 1.1 christos 732 1.1 christos aese v0.16b, v26.16b 733 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 734 1.1 christos eor v11.16b, v11.16b, v28.16b 735 1.1 christos 736 1.1 christos aese v1.16b, v26.16b 737 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 738 1.1 christos 739 1.1 christos aese v3.16b, v27.16b //AES block 4k+7 - round 9 740 1.1 christos 741 1.1 christos aese v2.16b, v26.16b 742 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 743 1.1 christos 744 1.1 christos aese v0.16b, v27.16b //AES block 4k+4 - round 9 745 1.1 christos 746 1.1 christos aese v1.16b, v27.16b //AES block 4k+5 - round 9 747 1.1 christos eor v11.16b, v11.16b, v10.16b 748 1.1 christos 749 1.1 christos aese v2.16b, v27.16b //AES block 4k+6 - round 9 750 1.1 christos .L128_enc_tail: //TAIL 751 1.1 christos 752 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 753 1.1 christos ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 754 1.1 christos #ifdef __AARCH64EB__ 755 1.1 christos rev x6, x6 756 1.1 christos rev x7, x7 757 1.1 christos #endif 758 1.1 christos cmp x5, #48 759 1.1 christos 760 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 761 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 10 low 762 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 10 high 763 1.1 christos 764 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 765 1.1 christos 766 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 767 1.1 christos 768 1.1 christos eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 769 1.1 christos 770 1.1 christos b.gt .L128_enc_blocks_more_than_3 771 1.1 christos 772 1.1 christos sub w12, w12, #1 773 1.1 christos movi v11.8b, #0 774 1.1 christos mov v3.16b, v2.16b 775 1.1 christos 776 1.1 christos cmp x5, #32 777 1.1 christos mov v2.16b, v1.16b 778 1.1 christos movi v9.8b, #0 779 1.1 christos 780 1.1 christos movi v10.8b, #0 781 1.1 christos b.gt .L128_enc_blocks_more_than_2 782 1.1 christos 783 1.1 christos mov v3.16b, v1.16b 784 1.1 christos cmp x5, #16 785 1.1 christos 786 1.1 christos sub w12, w12, #1 787 1.1 christos b.gt .L128_enc_blocks_more_than_1 788 1.1 christos 789 1.1 christos sub w12, w12, #1 790 1.1 christos b .L128_enc_blocks_less_than_1 791 1.1 christos .L128_enc_blocks_more_than_3: //blocks left > 3 792 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 793 1.1 christos 794 1.1 christos ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 795 1.1 christos #ifdef __AARCH64EB__ 796 1.1 christos rev x6, x6 797 1.1 christos rev x7, x7 798 1.1 christos #endif 799 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 800 1.1 christos 801 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 802 1.1 christos eor x7, x7, x14 //AES final-2 block - round 10 high 803 1.1 christos eor x6, x6, x13 //AES final-2 block - round 10 low 804 1.1 christos 805 1.1 christos fmov d5, x6 //AES final-2 block - mov low 806 1.1 christos 807 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 808 1.1 christos fmov v5.d[1], x7 //AES final-2 block - mov high 809 1.1 christos 810 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 811 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 812 1.1 christos 813 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 814 1.1 christos 815 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 816 1.1 christos 817 1.1 christos eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 818 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 819 1.1 christos 820 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 821 1.1 christos .L128_enc_blocks_more_than_2: //blocks left > 2 822 1.1 christos 823 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 824 1.1 christos 825 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 826 1.1 christos ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high 827 1.1 christos #ifdef __AARCH64EB__ 828 1.1 christos rev x6, x6 829 1.1 christos rev x7, x7 830 1.1 christos #endif 831 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 832 1.1 christos 833 1.1 christos eor x6, x6, x13 //AES final-1 block - round 10 low 834 1.1 christos 835 1.1 christos fmov d5, x6 //AES final-1 block - mov low 836 1.1 christos eor x7, x7, x14 //AES final-1 block - round 10 high 837 1.1 christos 838 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 839 1.1 christos fmov v5.d[1], x7 //AES final-1 block - mov high 840 1.1 christos 841 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 842 1.1 christos 843 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 844 1.1 christos 845 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 846 1.1 christos 847 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 848 1.1 christos 849 1.1 christos eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 850 1.1 christos 851 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 852 1.1 christos 853 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 854 1.1 christos 855 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 856 1.1 christos 857 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 858 1.1 christos .L128_enc_blocks_more_than_1: //blocks left > 1 859 1.1 christos 860 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 861 1.1 christos 862 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 863 1.1 christos ldp x6, x7, [x0], #16 //AES final block - load input low & high 864 1.1 christos #ifdef __AARCH64EB__ 865 1.1 christos rev x6, x6 866 1.1 christos rev x7, x7 867 1.1 christos #endif 868 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 869 1.1 christos 870 1.1 christos eor x7, x7, x14 //AES final block - round 10 high 871 1.1 christos eor x6, x6, x13 //AES final block - round 10 low 872 1.1 christos 873 1.1 christos fmov d5, x6 //AES final block - mov low 874 1.1 christos 875 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 876 1.1 christos fmov v5.d[1], x7 //AES final block - mov high 877 1.1 christos 878 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 879 1.1 christos 880 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 881 1.1 christos 882 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 883 1.1 christos 884 1.1 christos eor v5.16b, v5.16b, v3.16b //AES final block - result 885 1.1 christos 886 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 887 1.1 christos 888 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 889 1.1 christos 890 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 891 1.1 christos 892 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 893 1.1 christos 894 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 895 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 896 1.1 christos .L128_enc_blocks_less_than_1: //blocks left <= 1 897 1.1 christos 898 1.1 christos and x1, x1, #127 //bit_length %= 128 899 1.1 christos mvn x13, xzr //rk10_l = 0xffffffffffffffff 900 1.1 christos 901 1.1 christos mvn x14, xzr //rk10_h = 0xffffffffffffffff 902 1.1 christos sub x1, x1, #128 //bit_length -= 128 903 1.1 christos 904 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 905 1.1 christos 906 1.1 christos and x1, x1, #127 //bit_length %= 128 907 1.1 christos 908 1.1 christos lsr x14, x14, x1 //rk10_h is mask for top 64b of last block 909 1.1 christos cmp x1, #64 910 1.1 christos 911 1.1 christos csel x6, x13, x14, lt 912 1.1 christos csel x7, x14, xzr, lt 913 1.1 christos 914 1.1 christos fmov d0, x6 //ctr0b is mask for last block 915 1.1 christos 916 1.1 christos fmov v0.d[1], x7 917 1.1 christos 918 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 919 1.1 christos 920 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 921 1.1 christos 922 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 923 1.1 christos 924 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 925 1.1 christos 926 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 927 1.1 christos ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 928 1.1 christos 929 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 930 1.1 christos #ifndef __AARCH64EB__ 931 1.1 christos rev w9, w12 932 1.1 christos #else 933 1.1 christos mov w9, w12 934 1.1 christos #endif 935 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 936 1.1 christos 937 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 938 1.1 christos 939 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 940 1.1 christos 941 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 942 1.1 christos 943 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 944 1.1 christos movi v8.8b, #0xc2 945 1.1 christos 946 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 947 1.1 christos 948 1.1 christos shl d8, d8, #56 //mod_constant 949 1.1 christos 950 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 951 1.1 christos 952 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 953 1.1 christos 954 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 955 1.1 christos 956 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 957 1.1 christos 958 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 959 1.1 christos 960 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 961 1.1 christos 962 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 963 1.1 christos 964 1.1 christos bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 965 1.1 christos 966 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 967 1.1 christos st1 { v5.16b}, [x2] //store all 16B 968 1.1 christos 969 1.1 christos str w9, [x16, #12] //store the updated counter 970 1.1 christos 971 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 972 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 973 1.1 christos rev64 v11.16b, v11.16b 974 1.1 christos mov x0, x15 975 1.1 christos st1 { v11.16b }, [x3] 976 1.1 christos ldp x21, x22, [sp, #16] 977 1.1 christos ldp x23, x24, [sp, #32] 978 1.1 christos ldp d8, d9, [sp, #48] 979 1.1 christos ldp d10, d11, [sp, #64] 980 1.1 christos ldp d12, d13, [sp, #80] 981 1.1 christos ldp d14, d15, [sp, #96] 982 1.1 christos ldp x19, x20, [sp], #112 983 1.1 christos ret 984 1.1 christos 985 1.1 christos .L128_enc_ret: 986 1.1 christos mov w0, #0x0 987 1.1 christos ret 988 1.1 christos .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel 989 1.1 christos .globl aes_gcm_dec_128_kernel 990 1.1 christos .type aes_gcm_dec_128_kernel,%function 991 1.1 christos .align 4 992 1.1 christos aes_gcm_dec_128_kernel: 993 1.2 christos AARCH64_VALID_CALL_TARGET 994 1.1 christos cbz x1, .L128_dec_ret 995 1.1 christos stp x19, x20, [sp, #-112]! 996 1.1 christos mov x16, x4 997 1.1 christos mov x8, x5 998 1.1 christos stp x21, x22, [sp, #16] 999 1.1 christos stp x23, x24, [sp, #32] 1000 1.1 christos stp d8, d9, [sp, #48] 1001 1.1 christos stp d10, d11, [sp, #64] 1002 1.1 christos stp d12, d13, [sp, #80] 1003 1.1 christos stp d14, d15, [sp, #96] 1004 1.1 christos 1005 1.1 christos lsr x5, x1, #3 //byte_len 1006 1.1 christos mov x15, x5 1007 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 1008 1.1 christos #ifdef __AARCH64EB__ 1009 1.1 christos rev x10, x10 1010 1.1 christos rev x11, x11 1011 1.1 christos #endif 1012 1.1 christos ldp x13, x14, [x8, #160] //load rk10 1013 1.1 christos #ifdef __AARCH64EB__ 1014 1.1 christos ror x14, x14, 32 1015 1.1 christos ror x13, x13, 32 1016 1.1 christos #endif 1017 1.1 christos sub x5, x5, #1 //byte_len - 1 1018 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 1019 1.1 christos 1020 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 1021 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 1022 1.1 christos 1023 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 1024 1.1 christos #ifndef __AARCH64EB__ 1025 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 1026 1.1 christos #endif 1027 1.1 christos lsr x12, x11, #32 1028 1.1 christos fmov d2, x10 //CTR block 2 1029 1.1 christos 1030 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 1031 1.1 christos orr w11, w11, w11 1032 1.1 christos rev w12, w12 //rev_ctr32 1033 1.1 christos 1034 1.1 christos fmov d1, x10 //CTR block 1 1035 1.1 christos add w12, w12, #1 //increment rev_ctr32 1036 1.1 christos 1037 1.1 christos aese v0.16b, v18.16b 1038 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 1039 1.1 christos rev w9, w12 //CTR block 1 1040 1.1 christos 1041 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 1042 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 1043 1.1 christos add w12, w12, #1 //CTR block 1 1044 1.1 christos 1045 1.1 christos fmov v1.d[1], x9 //CTR block 1 1046 1.1 christos rev w9, w12 //CTR block 2 1047 1.1 christos add w12, w12, #1 //CTR block 2 1048 1.1 christos 1049 1.1 christos aese v0.16b, v19.16b 1050 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 1051 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 1052 1.1 christos 1053 1.1 christos fmov v2.d[1], x9 //CTR block 2 1054 1.1 christos rev w9, w12 //CTR block 3 1055 1.1 christos 1056 1.1 christos fmov d3, x10 //CTR block 3 1057 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 1058 1.1 christos add w12, w12, #1 //CTR block 3 1059 1.1 christos 1060 1.1 christos fmov v3.d[1], x9 //CTR block 3 1061 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 1062 1.1 christos 1063 1.1 christos aese v1.16b, v18.16b 1064 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 1065 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 1066 1.1 christos 1067 1.1 christos aese v0.16b, v20.16b 1068 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 1069 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 1070 1.1 christos 1071 1.1 christos aese v2.16b, v18.16b 1072 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 1073 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 1074 1.1 christos 1075 1.1 christos aese v1.16b, v19.16b 1076 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 1077 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 1078 1.1 christos 1079 1.1 christos aese v3.16b, v18.16b 1080 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 1081 1.1 christos 1082 1.1 christos aese v2.16b, v19.16b 1083 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 1084 1.1 christos 1085 1.1 christos aese v1.16b, v20.16b 1086 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 1087 1.1 christos 1088 1.1 christos aese v3.16b, v19.16b 1089 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 1090 1.1 christos ld1 { v11.16b}, [x3] 1091 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 1092 1.1 christos rev64 v11.16b, v11.16b 1093 1.1 christos 1094 1.1 christos aese v0.16b, v21.16b 1095 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 1096 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 1097 1.1 christos 1098 1.1 christos aese v1.16b, v21.16b 1099 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 1100 1.1 christos 1101 1.1 christos aese v3.16b, v20.16b 1102 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 1103 1.1 christos 1104 1.1 christos aese v2.16b, v20.16b 1105 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 1106 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 1107 1.1 christos 1108 1.1 christos aese v1.16b, v22.16b 1109 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 1110 1.1 christos 1111 1.1 christos aese v3.16b, v21.16b 1112 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 1113 1.1 christos 1114 1.1 christos aese v2.16b, v21.16b 1115 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 1116 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 1117 1.1 christos #ifndef __AARCH64EB__ 1118 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 1119 1.1 christos #endif 1120 1.1 christos aese v0.16b, v22.16b 1121 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 1122 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 1123 1.1 christos 1124 1.1 christos aese v1.16b, v23.16b 1125 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 1126 1.1 christos 1127 1.1 christos aese v2.16b, v22.16b 1128 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 1129 1.1 christos 1130 1.1 christos aese v3.16b, v22.16b 1131 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 1132 1.1 christos 1133 1.1 christos aese v0.16b, v23.16b 1134 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 1135 1.1 christos 1136 1.1 christos aese v2.16b, v23.16b 1137 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 1138 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 1139 1.1 christos #ifndef __AARCH64EB__ 1140 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 1141 1.1 christos #endif 1142 1.1 christos aese v3.16b, v23.16b 1143 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 1144 1.1 christos 1145 1.1 christos aese v0.16b, v24.16b 1146 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 1147 1.1 christos 1148 1.1 christos aese v1.16b, v24.16b 1149 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 1150 1.1 christos 1151 1.1 christos aese v3.16b, v24.16b 1152 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 1153 1.1 christos 1154 1.1 christos aese v2.16b, v24.16b 1155 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 1156 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 1157 1.1 christos 1158 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 1159 1.1 christos #ifndef __AARCH64EB__ 1160 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 1161 1.1 christos #endif 1162 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 1163 1.1 christos add x5, x5, x0 1164 1.1 christos 1165 1.1 christos aese v1.16b, v25.16b 1166 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 1167 1.1 christos 1168 1.1 christos aese v2.16b, v25.16b 1169 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 1170 1.1 christos 1171 1.1 christos aese v0.16b, v25.16b 1172 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 1173 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 1174 1.1 christos 1175 1.1 christos aese v3.16b, v25.16b 1176 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 1177 1.1 christos 1178 1.1 christos aese v1.16b, v26.16b 1179 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 1180 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 1181 1.1 christos 1182 1.1 christos aese v2.16b, v26.16b 1183 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 1184 1.1 christos 1185 1.1 christos aese v3.16b, v26.16b 1186 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 1187 1.1 christos 1188 1.1 christos aese v0.16b, v26.16b 1189 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 1190 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 1191 1.1 christos 1192 1.1 christos aese v2.16b, v27.16b //AES block 2 - round 9 1193 1.1 christos 1194 1.1 christos aese v3.16b, v27.16b //AES block 3 - round 9 1195 1.1 christos 1196 1.1 christos aese v0.16b, v27.16b //AES block 0 - round 9 1197 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 1198 1.1 christos 1199 1.1 christos aese v1.16b, v27.16b //AES block 1 - round 9 1200 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 1201 1.1 christos b.ge .L128_dec_tail //handle tail 1202 1.1 christos 1203 1.1 christos ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext 1204 1.1 christos 1205 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 1 - result 1206 1.1 christos ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 1207 1.1 christos 1208 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 0 - result 1209 1.1 christos rev64 v4.16b, v4.16b //GHASH block 0 1210 1.1 christos rev w9, w12 //CTR block 4 1211 1.1 christos 1212 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 1213 1.1 christos add w12, w12, #1 //CTR block 4 1214 1.1 christos ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 1215 1.1 christos 1216 1.1 christos rev64 v5.16b, v5.16b //GHASH block 1 1217 1.1 christos mov x19, v1.d[0] //AES block 1 - mov low 1218 1.1 christos 1219 1.1 christos mov x20, v1.d[1] //AES block 1 - mov high 1220 1.1 christos 1221 1.1 christos mov x6, v0.d[0] //AES block 0 - mov low 1222 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 1223 1.1 christos 1224 1.1 christos mov x7, v0.d[1] //AES block 0 - mov high 1225 1.1 christos 1226 1.1 christos fmov d0, x10 //CTR block 4 1227 1.1 christos 1228 1.1 christos fmov v0.d[1], x9 //CTR block 4 1229 1.1 christos rev w9, w12 //CTR block 5 1230 1.1 christos eor x19, x19, x13 //AES block 1 - round 10 low 1231 1.1 christos #ifdef __AARCH64EB__ 1232 1.1 christos rev x19, x19 1233 1.1 christos #endif 1234 1.1 christos fmov d1, x10 //CTR block 5 1235 1.1 christos add w12, w12, #1 //CTR block 5 1236 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 1237 1.1 christos 1238 1.1 christos fmov v1.d[1], x9 //CTR block 5 1239 1.1 christos rev w9, w12 //CTR block 6 1240 1.1 christos add w12, w12, #1 //CTR block 6 1241 1.1 christos 1242 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 1243 1.1 christos 1244 1.1 christos eor x20, x20, x14 //AES block 1 - round 10 high 1245 1.1 christos #ifdef __AARCH64EB__ 1246 1.1 christos rev x20, x20 1247 1.1 christos #endif 1248 1.1 christos eor x6, x6, x13 //AES block 0 - round 10 low 1249 1.1 christos #ifdef __AARCH64EB__ 1250 1.1 christos rev x6, x6 1251 1.1 christos #endif 1252 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 2 - result 1253 1.1 christos 1254 1.1 christos eor x7, x7, x14 //AES block 0 - round 10 high 1255 1.1 christos #ifdef __AARCH64EB__ 1256 1.1 christos rev x7, x7 1257 1.1 christos #endif 1258 1.1 christos stp x6, x7, [x2], #16 //AES block 0 - store result 1259 1.1 christos 1260 1.1 christos stp x19, x20, [x2], #16 //AES block 1 - store result 1261 1.1 christos b.ge .L128_dec_prepretail //do prepretail 1262 1.1 christos 1263 1.1 christos .L128_dec_main_loop: //main loop start 1264 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1265 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1266 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 1267 1.1 christos 1268 1.1 christos pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1269 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 1270 1.1 christos 1271 1.1 christos aese v1.16b, v18.16b 1272 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1273 1.1 christos fmov d2, x10 //CTR block 4k+6 1274 1.1 christos 1275 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 1276 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 1277 1.1 christos rev w9, w12 //CTR block 4k+7 1278 1.1 christos 1279 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 1280 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 1281 1.1 christos mov d30, v5.d[1] //GHASH block 4k+1 - mid 1282 1.1 christos 1283 1.1 christos aese v1.16b, v19.16b 1284 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1285 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 1286 1.1 christos 1287 1.1 christos pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1288 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 1289 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 1290 1.1 christos 1291 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1292 1.1 christos fmov d3, x10 //CTR block 4k+7 1293 1.1 christos eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1294 1.1 christos 1295 1.1 christos aese v1.16b, v20.16b 1296 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1297 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 1298 1.1 christos 1299 1.1 christos aese v2.16b, v18.16b 1300 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1301 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 1302 1.1 christos 1303 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1304 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1305 1.1 christos 1306 1.1 christos pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1307 1.1 christos 1308 1.1 christos aese v1.16b, v21.16b 1309 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1310 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 1311 1.1 christos 1312 1.1 christos aese v3.16b, v18.16b 1313 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1314 1.1 christos eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1315 1.1 christos 1316 1.1 christos aese v0.16b, v18.16b 1317 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1318 1.1 christos 1319 1.1 christos pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1320 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1321 1.1 christos 1322 1.1 christos aese v3.16b, v19.16b 1323 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1324 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 10 low 1325 1.1 christos #ifdef __AARCH64EB__ 1326 1.1 christos rev x23, x23 1327 1.1 christos #endif 1328 1.1 christos pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1329 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 10 high 1330 1.1 christos #ifdef __AARCH64EB__ 1331 1.1 christos rev x22, x22 1332 1.1 christos #endif 1333 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 1334 1.1 christos 1335 1.1 christos aese v0.16b, v19.16b 1336 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1337 1.1 christos eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1338 1.1 christos 1339 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1340 1.1 christos 1341 1.1 christos aese v3.16b, v20.16b 1342 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1343 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1344 1.1 christos 1345 1.1 christos aese v0.16b, v20.16b 1346 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1347 1.1 christos 1348 1.1 christos aese v1.16b, v22.16b 1349 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1350 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1351 1.1 christos 1352 1.1 christos pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1353 1.1 christos 1354 1.1 christos aese v0.16b, v21.16b 1355 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1356 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1357 1.1 christos 1358 1.1 christos pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1359 1.1 christos 1360 1.1 christos aese v2.16b, v19.16b 1361 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1362 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 1363 1.1 christos 1364 1.1 christos aese v0.16b, v22.16b 1365 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1366 1.1 christos eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1367 1.1 christos 1368 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1369 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 10 high 1370 1.1 christos #ifdef __AARCH64EB__ 1371 1.1 christos rev x24, x24 1372 1.1 christos #endif 1373 1.1 christos aese v2.16b, v20.16b 1374 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1375 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1376 1.1 christos 1377 1.1 christos aese v1.16b, v23.16b 1378 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1379 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 10 low 1380 1.1 christos #ifdef __AARCH64EB__ 1381 1.1 christos rev x21, x21 1382 1.1 christos #endif 1383 1.1 christos aese v0.16b, v23.16b 1384 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1385 1.1 christos movi v8.8b, #0xc2 1386 1.1 christos 1387 1.1 christos aese v2.16b, v21.16b 1388 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1389 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1390 1.1 christos 1391 1.1 christos aese v1.16b, v24.16b 1392 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1393 1.1 christos 1394 1.1 christos aese v0.16b, v24.16b 1395 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1396 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1397 1.1 christos 1398 1.1 christos aese v2.16b, v22.16b 1399 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1400 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1401 1.1 christos 1402 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1403 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1404 1.1 christos ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext 1405 1.1 christos 1406 1.1 christos aese v1.16b, v25.16b 1407 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1408 1.1 christos add w12, w12, #1 //CTR block 4k+7 1409 1.1 christos 1410 1.1 christos aese v0.16b, v25.16b 1411 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1412 1.1 christos shl d8, d8, #56 //mod_constant 1413 1.1 christos 1414 1.1 christos aese v2.16b, v23.16b 1415 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1416 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1417 1.1 christos 1418 1.1 christos aese v1.16b, v26.16b 1419 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1420 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1421 1.1 christos 1422 1.1 christos aese v0.16b, v26.16b 1423 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1424 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1425 1.1 christos 1426 1.1 christos aese v3.16b, v21.16b 1427 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1428 1.1 christos rev w9, w12 //CTR block 4k+8 1429 1.1 christos 1430 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1431 1.1 christos ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1432 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1433 1.1 christos 1434 1.1 christos aese v0.16b, v27.16b //AES block 4k+4 - round 9 1435 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 1436 1.1 christos 1437 1.1 christos aese v3.16b, v22.16b 1438 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1439 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1440 1.1 christos 1441 1.1 christos aese v1.16b, v27.16b //AES block 4k+5 - round 9 1442 1.1 christos 1443 1.1 christos aese v2.16b, v24.16b 1444 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1445 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 1446 1.1 christos 1447 1.1 christos aese v3.16b, v23.16b 1448 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1449 1.1 christos ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 1450 1.1 christos 1451 1.1 christos add w12, w12, #1 //CTR block 4k+8 1452 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1453 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 1454 1.1 christos 1455 1.1 christos aese v2.16b, v25.16b 1456 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1457 1.1 christos ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 1458 1.1 christos 1459 1.1 christos aese v3.16b, v24.16b 1460 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1461 1.1 christos 1462 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+5 1463 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1464 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 1465 1.1 christos 1466 1.1 christos aese v2.16b, v26.16b 1467 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1468 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 1469 1.1 christos 1470 1.1 christos aese v3.16b, v25.16b 1471 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1472 1.1 christos fmov d0, x10 //CTR block 4k+8 1473 1.1 christos 1474 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1475 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 1476 1.1 christos rev w9, w12 //CTR block 4k+9 1477 1.1 christos 1478 1.1 christos aese v2.16b, v27.16b //AES block 4k+6 - round 9 1479 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 1480 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1481 1.1 christos 1482 1.1 christos aese v3.16b, v26.16b 1483 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1484 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 10 high 1485 1.1 christos #ifdef __AARCH64EB__ 1486 1.1 christos rev x7, x7 1487 1.1 christos #endif 1488 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1489 1.1 christos mov x20, v1.d[1] //AES block 4k+5 - mov high 1490 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 10 low 1491 1.1 christos #ifdef __AARCH64EB__ 1492 1.1 christos rev x6, x6 1493 1.1 christos #endif 1494 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 1495 1.1 christos mov x19, v1.d[0] //AES block 4k+5 - mov low 1496 1.1 christos add w12, w12, #1 //CTR block 4k+9 1497 1.1 christos 1498 1.1 christos aese v3.16b, v27.16b //AES block 4k+7 - round 9 1499 1.1 christos fmov d1, x10 //CTR block 4k+9 1500 1.1 christos cmp x0, x5 //.LOOP CONTROL 1501 1.1 christos 1502 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k+4 1503 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1504 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 1505 1.1 christos 1506 1.1 christos rev w9, w12 //CTR block 4k+10 1507 1.1 christos add w12, w12, #1 //CTR block 4k+10 1508 1.1 christos 1509 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 10 high 1510 1.1 christos #ifdef __AARCH64EB__ 1511 1.1 christos rev x20, x20 1512 1.1 christos #endif 1513 1.1 christos stp x6, x7, [x2], #16 //AES block 4k+4 - store result 1514 1.1 christos 1515 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 10 low 1516 1.1 christos #ifdef __AARCH64EB__ 1517 1.1 christos rev x19, x19 1518 1.1 christos #endif 1519 1.1 christos stp x19, x20, [x2], #16 //AES block 4k+5 - store result 1520 1.1 christos 1521 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 1522 1.1 christos b.lt .L128_dec_main_loop 1523 1.1 christos 1524 1.1 christos .L128_dec_prepretail: //PREPRETAIL 1525 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1526 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 1527 1.1 christos mov d30, v5.d[1] //GHASH block 4k+1 - mid 1528 1.1 christos 1529 1.1 christos aese v0.16b, v18.16b 1530 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1531 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1532 1.1 christos 1533 1.1 christos aese v1.16b, v18.16b 1534 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1535 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 1536 1.1 christos 1537 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 1538 1.1 christos fmov d2, x10 //CTR block 4k+6 1539 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 1540 1.1 christos 1541 1.1 christos aese v0.16b, v19.16b 1542 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1543 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 1544 1.1 christos 1545 1.1 christos rev w9, w12 //CTR block 4k+7 1546 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 1547 1.1 christos eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1548 1.1 christos 1549 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1550 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 1551 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 1552 1.1 christos 1553 1.1 christos aese v1.16b, v19.16b 1554 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1555 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 1556 1.1 christos 1557 1.1 christos aese v0.16b, v20.16b 1558 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1559 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 1560 1.1 christos 1561 1.1 christos pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1562 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 1563 1.1 christos fmov d3, x10 //CTR block 4k+7 1564 1.1 christos 1565 1.1 christos aese v2.16b, v18.16b 1566 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1567 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 1568 1.1 christos 1569 1.1 christos pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1570 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1571 1.1 christos 1572 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 1573 1.1 christos 1574 1.1 christos aese v2.16b, v19.16b 1575 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1576 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1577 1.1 christos 1578 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1579 1.1 christos 1580 1.1 christos aese v3.16b, v18.16b 1581 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1582 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1583 1.1 christos 1584 1.1 christos pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1585 1.1 christos 1586 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1587 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1588 1.1 christos 1589 1.1 christos pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1590 1.1 christos 1591 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1592 1.1 christos eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1593 1.1 christos 1594 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1595 1.1 christos 1596 1.1 christos pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1597 1.1 christos 1598 1.1 christos pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1599 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 1600 1.1 christos 1601 1.1 christos aese v1.16b, v20.16b 1602 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1603 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1604 1.1 christos 1605 1.1 christos pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1606 1.1 christos 1607 1.1 christos eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1608 1.1 christos movi v8.8b, #0xc2 1609 1.1 christos 1610 1.1 christos aese v3.16b, v19.16b 1611 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1612 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1613 1.1 christos 1614 1.1 christos eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1615 1.1 christos 1616 1.1 christos aese v2.16b, v20.16b 1617 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1618 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1619 1.1 christos 1620 1.1 christos aese v3.16b, v20.16b 1621 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1622 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 10 low 1623 1.1 christos #ifdef __AARCH64EB__ 1624 1.1 christos rev x23, x23 1625 1.1 christos #endif 1626 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1627 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 10 low 1628 1.1 christos #ifdef __AARCH64EB__ 1629 1.1 christos rev x21, x21 1630 1.1 christos #endif 1631 1.1 christos eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1632 1.1 christos 1633 1.1 christos aese v2.16b, v21.16b 1634 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1635 1.1 christos 1636 1.1 christos aese v1.16b, v21.16b 1637 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1638 1.1 christos shl d8, d8, #56 //mod_constant 1639 1.1 christos 1640 1.1 christos aese v0.16b, v21.16b 1641 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1642 1.1 christos 1643 1.1 christos aese v2.16b, v22.16b 1644 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1645 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1646 1.1 christos 1647 1.1 christos aese v1.16b, v22.16b 1648 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1649 1.1 christos 1650 1.1 christos aese v3.16b, v21.16b 1651 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1652 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1653 1.1 christos 1654 1.1 christos aese v2.16b, v23.16b 1655 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1656 1.1 christos 1657 1.1 christos aese v1.16b, v23.16b 1658 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1659 1.1 christos 1660 1.1 christos aese v3.16b, v22.16b 1661 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1662 1.1 christos 1663 1.1 christos aese v0.16b, v22.16b 1664 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1665 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1666 1.1 christos 1667 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1668 1.1 christos 1669 1.1 christos aese v1.16b, v24.16b 1670 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1671 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1672 1.1 christos 1673 1.1 christos aese v3.16b, v23.16b 1674 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1675 1.1 christos 1676 1.1 christos aese v0.16b, v23.16b 1677 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1678 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1679 1.1 christos 1680 1.1 christos aese v1.16b, v25.16b 1681 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1682 1.1 christos 1683 1.1 christos aese v2.16b, v24.16b 1684 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1685 1.1 christos 1686 1.1 christos aese v0.16b, v24.16b 1687 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1688 1.1 christos 1689 1.1 christos aese v1.16b, v26.16b 1690 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1691 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1692 1.1 christos 1693 1.1 christos aese v3.16b, v24.16b 1694 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1695 1.1 christos 1696 1.1 christos aese v0.16b, v25.16b 1697 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1698 1.1 christos 1699 1.1 christos aese v1.16b, v27.16b //AES block 4k+5 - round 9 1700 1.1 christos 1701 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1702 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 10 high 1703 1.1 christos #ifdef __AARCH64EB__ 1704 1.1 christos rev x24, x24 1705 1.1 christos #endif 1706 1.1 christos aese v2.16b, v25.16b 1707 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1708 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1709 1.1 christos 1710 1.1 christos aese v3.16b, v25.16b 1711 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1712 1.1 christos 1713 1.1 christos aese v0.16b, v26.16b 1714 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1715 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1716 1.1 christos 1717 1.1 christos aese v2.16b, v26.16b 1718 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1719 1.1 christos 1720 1.1 christos aese v3.16b, v26.16b 1721 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1722 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 10 high 1723 1.1 christos #ifdef __AARCH64EB__ 1724 1.1 christos rev x22, x22 1725 1.1 christos #endif 1726 1.1 christos aese v0.16b, v27.16b //AES block 4k+4 - round 9 1727 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1728 1.1 christos 1729 1.1 christos aese v2.16b, v27.16b //AES block 4k+6 - round 9 1730 1.1 christos add w12, w12, #1 //CTR block 4k+7 1731 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1732 1.1 christos 1733 1.1 christos aese v3.16b, v27.16b //AES block 4k+7 - round 9 1734 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1735 1.1 christos .L128_dec_tail: //TAIL 1736 1.1 christos 1737 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 1738 1.1 christos ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1739 1.1 christos 1740 1.1 christos eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 1741 1.1 christos 1742 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 1743 1.1 christos 1744 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 1745 1.1 christos 1746 1.1 christos cmp x5, #48 1747 1.1 christos 1748 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 10 high 1749 1.1 christos #ifdef __AARCH64EB__ 1750 1.1 christos rev x7, x7 1751 1.1 christos #endif 1752 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 1753 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 10 low 1754 1.1 christos #ifdef __AARCH64EB__ 1755 1.1 christos rev x6, x6 1756 1.1 christos #endif 1757 1.1 christos b.gt .L128_dec_blocks_more_than_3 1758 1.1 christos 1759 1.1 christos mov v3.16b, v2.16b 1760 1.1 christos sub w12, w12, #1 1761 1.1 christos movi v11.8b, #0 1762 1.1 christos 1763 1.1 christos movi v9.8b, #0 1764 1.1 christos mov v2.16b, v1.16b 1765 1.1 christos 1766 1.1 christos movi v10.8b, #0 1767 1.1 christos cmp x5, #32 1768 1.1 christos b.gt .L128_dec_blocks_more_than_2 1769 1.1 christos 1770 1.1 christos cmp x5, #16 1771 1.1 christos 1772 1.1 christos mov v3.16b, v1.16b 1773 1.1 christos sub w12, w12, #1 1774 1.1 christos b.gt .L128_dec_blocks_more_than_1 1775 1.1 christos 1776 1.1 christos sub w12, w12, #1 1777 1.1 christos b .L128_dec_blocks_less_than_1 1778 1.1 christos .L128_dec_blocks_more_than_3: //blocks left > 3 1779 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 1780 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 1781 1.1 christos 1782 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 1783 1.1 christos 1784 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 1785 1.1 christos stp x6, x7, [x2], #16 //AES final-3 block - store result 1786 1.1 christos eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 1787 1.1 christos 1788 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 1789 1.1 christos mov x7, v0.d[1] //AES final-2 block - mov high 1790 1.1 christos 1791 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 1792 1.1 christos mov x6, v0.d[0] //AES final-2 block - mov low 1793 1.1 christos 1794 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 1795 1.1 christos 1796 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 1797 1.1 christos 1798 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 1799 1.1 christos eor x7, x7, x14 //AES final-2 block - round 10 high 1800 1.1 christos #ifdef __AARCH64EB__ 1801 1.1 christos rev x7, x7 1802 1.1 christos #endif 1803 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 1804 1.1 christos eor x6, x6, x13 //AES final-2 block - round 10 low 1805 1.1 christos #ifdef __AARCH64EB__ 1806 1.1 christos rev x6, x6 1807 1.1 christos #endif 1808 1.1 christos .L128_dec_blocks_more_than_2: //blocks left > 2 1809 1.1 christos 1810 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 1811 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 1812 1.1 christos 1813 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 1814 1.1 christos 1815 1.1 christos eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 1816 1.1 christos stp x6, x7, [x2], #16 //AES final-2 block - store result 1817 1.1 christos 1818 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 1819 1.1 christos 1820 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 1821 1.1 christos 1822 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 1823 1.1 christos mov x6, v0.d[0] //AES final-1 block - mov low 1824 1.1 christos 1825 1.1 christos mov x7, v0.d[1] //AES final-1 block - mov high 1826 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 1827 1.1 christos 1828 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 1829 1.1 christos 1830 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 1831 1.1 christos 1832 1.1 christos eor x6, x6, x13 //AES final-1 block - round 10 low 1833 1.1 christos #ifdef __AARCH64EB__ 1834 1.1 christos rev x6, x6 1835 1.1 christos #endif 1836 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 1837 1.1 christos 1838 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 1839 1.1 christos 1840 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 1841 1.1 christos eor x7, x7, x14 //AES final-1 block - round 10 high 1842 1.1 christos #ifdef __AARCH64EB__ 1843 1.1 christos rev x7, x7 1844 1.1 christos #endif 1845 1.1 christos .L128_dec_blocks_more_than_1: //blocks left > 1 1846 1.1 christos 1847 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 1848 1.1 christos 1849 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 1850 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 1851 1.1 christos 1852 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 1853 1.1 christos 1854 1.1 christos eor v0.16b, v5.16b, v3.16b //AES final block - result 1855 1.1 christos 1856 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 1857 1.1 christos 1858 1.1 christos stp x6, x7, [x2], #16 //AES final-1 block - store result 1859 1.1 christos mov x6, v0.d[0] //AES final block - mov low 1860 1.1 christos 1861 1.1 christos mov x7, v0.d[1] //AES final block - mov high 1862 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 1863 1.1 christos 1864 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 1865 1.1 christos 1866 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 1867 1.1 christos 1868 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 1869 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 1870 1.1 christos 1871 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 1872 1.1 christos 1873 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 1874 1.1 christos eor x7, x7, x14 //AES final block - round 10 high 1875 1.1 christos #ifdef __AARCH64EB__ 1876 1.1 christos rev x7, x7 1877 1.1 christos #endif 1878 1.1 christos eor x6, x6, x13 //AES final block - round 10 low 1879 1.1 christos #ifdef __AARCH64EB__ 1880 1.1 christos rev x6, x6 1881 1.1 christos #endif 1882 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 1883 1.1 christos .L128_dec_blocks_less_than_1: //blocks left <= 1 1884 1.1 christos 1885 1.1 christos mvn x14, xzr //rk10_h = 0xffffffffffffffff 1886 1.1 christos and x1, x1, #127 //bit_length %= 128 1887 1.1 christos 1888 1.1 christos mvn x13, xzr //rk10_l = 0xffffffffffffffff 1889 1.1 christos sub x1, x1, #128 //bit_length -= 128 1890 1.1 christos 1891 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 1892 1.1 christos 1893 1.1 christos and x1, x1, #127 //bit_length %= 128 1894 1.1 christos 1895 1.1 christos lsr x14, x14, x1 //rk10_h is mask for top 64b of last block 1896 1.1 christos cmp x1, #64 1897 1.1 christos 1898 1.1 christos csel x10, x14, xzr, lt 1899 1.1 christos csel x9, x13, x14, lt 1900 1.1 christos 1901 1.1 christos fmov d0, x9 //ctr0b is mask for last block 1902 1.1 christos 1903 1.1 christos mov v0.d[1], x10 1904 1.1 christos 1905 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 1906 1.1 christos 1907 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 1908 1.1 christos 1909 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 1910 1.1 christos 1911 1.1 christos ldp x4, x5, [x2] //load existing bytes we need to not overwrite 1912 1.1 christos 1913 1.1 christos and x7, x7, x10 1914 1.1 christos 1915 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 1916 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 1917 1.1 christos 1918 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 1919 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 1920 1.1 christos 1921 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 1922 1.1 christos 1923 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 1924 1.1 christos bic x4, x4, x9 //mask out low existing bytes 1925 1.1 christos and x6, x6, x9 1926 1.1 christos 1927 1.1 christos #ifndef __AARCH64EB__ 1928 1.1 christos rev w9, w12 1929 1.1 christos #else 1930 1.1 christos mov w9, w12 1931 1.1 christos #endif 1932 1.1 christos 1933 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 1934 1.1 christos movi v8.8b, #0xc2 1935 1.1 christos 1936 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 1937 1.1 christos 1938 1.1 christos bic x5, x5, x10 //mask out high existing bytes 1939 1.1 christos shl d8, d8, #56 //mod_constant 1940 1.1 christos 1941 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1942 1.1 christos 1943 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1944 1.1 christos 1945 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1946 1.1 christos 1947 1.1 christos orr x6, x6, x4 1948 1.1 christos str w9, [x16, #12] //store the updated counter 1949 1.1 christos 1950 1.1 christos orr x7, x7, x5 1951 1.1 christos stp x6, x7, [x2] 1952 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1953 1.1 christos 1954 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1955 1.1 christos 1956 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1957 1.1 christos 1958 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1959 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1960 1.1 christos 1961 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1962 1.1 christos 1963 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1964 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 1965 1.1 christos rev64 v11.16b, v11.16b 1966 1.1 christos mov x0, x15 1967 1.1 christos st1 { v11.16b }, [x3] 1968 1.1 christos 1969 1.1 christos ldp x21, x22, [sp, #16] 1970 1.1 christos ldp x23, x24, [sp, #32] 1971 1.1 christos ldp d8, d9, [sp, #48] 1972 1.1 christos ldp d10, d11, [sp, #64] 1973 1.1 christos ldp d12, d13, [sp, #80] 1974 1.1 christos ldp d14, d15, [sp, #96] 1975 1.1 christos ldp x19, x20, [sp], #112 1976 1.1 christos ret 1977 1.1 christos 1978 1.1 christos .L128_dec_ret: 1979 1.1 christos mov w0, #0x0 1980 1.1 christos ret 1981 1.1 christos .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 1982 1.1 christos .globl aes_gcm_enc_192_kernel 1983 1.1 christos .type aes_gcm_enc_192_kernel,%function 1984 1.1 christos .align 4 1985 1.1 christos aes_gcm_enc_192_kernel: 1986 1.2 christos AARCH64_VALID_CALL_TARGET 1987 1.1 christos cbz x1, .L192_enc_ret 1988 1.1 christos stp x19, x20, [sp, #-112]! 1989 1.1 christos mov x16, x4 1990 1.1 christos mov x8, x5 1991 1.1 christos stp x21, x22, [sp, #16] 1992 1.1 christos stp x23, x24, [sp, #32] 1993 1.1 christos stp d8, d9, [sp, #48] 1994 1.1 christos stp d10, d11, [sp, #64] 1995 1.1 christos stp d12, d13, [sp, #80] 1996 1.1 christos stp d14, d15, [sp, #96] 1997 1.1 christos 1998 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 1999 1.1 christos #ifdef __AARCH64EB__ 2000 1.1 christos rev x10, x10 2001 1.1 christos rev x11, x11 2002 1.1 christos #endif 2003 1.1 christos ldp x13, x14, [x8, #192] //load rk12 2004 1.1 christos #ifdef __AARCH64EB__ 2005 1.1 christos ror x13, x13, #32 2006 1.1 christos ror x14, x14, #32 2007 1.1 christos #endif 2008 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 2009 1.1 christos 2010 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 2011 1.1 christos 2012 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 2013 1.1 christos 2014 1.1 christos lsr x12, x11, #32 2015 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 2016 1.1 christos orr w11, w11, w11 2017 1.1 christos 2018 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 2019 1.1 christos rev w12, w12 //rev_ctr32 2020 1.1 christos 2021 1.1 christos add w12, w12, #1 //increment rev_ctr32 2022 1.1 christos fmov d3, x10 //CTR block 3 2023 1.1 christos 2024 1.1 christos rev w9, w12 //CTR block 1 2025 1.1 christos add w12, w12, #1 //CTR block 1 2026 1.1 christos fmov d1, x10 //CTR block 1 2027 1.1 christos 2028 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 2029 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 2030 1.1 christos 2031 1.1 christos fmov v1.d[1], x9 //CTR block 1 2032 1.1 christos rev w9, w12 //CTR block 2 2033 1.1 christos add w12, w12, #1 //CTR block 2 2034 1.1 christos 2035 1.1 christos fmov d2, x10 //CTR block 2 2036 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 2037 1.1 christos 2038 1.1 christos fmov v2.d[1], x9 //CTR block 2 2039 1.1 christos rev w9, w12 //CTR block 3 2040 1.1 christos 2041 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 2042 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 2043 1.1 christos 2044 1.1 christos fmov v3.d[1], x9 //CTR block 3 2045 1.1 christos 2046 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 2047 1.1 christos 2048 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 2049 1.1 christos 2050 1.1 christos aese v0.16b, v18.16b 2051 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 2052 1.1 christos ld1 { v11.16b}, [x3] 2053 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 2054 1.1 christos rev64 v11.16b, v11.16b 2055 1.1 christos 2056 1.1 christos aese v3.16b, v18.16b 2057 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 2058 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 2059 1.1 christos 2060 1.1 christos aese v1.16b, v18.16b 2061 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 2062 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 2063 1.1 christos #ifndef __AARCH64EB__ 2064 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 2065 1.1 christos #endif 2066 1.1 christos aese v2.16b, v18.16b 2067 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 2068 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 2069 1.1 christos 2070 1.1 christos aese v0.16b, v19.16b 2071 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 2072 1.1 christos ld1 {v28.4s}, [x8], #16 //load rk10 2073 1.1 christos 2074 1.1 christos aese v1.16b, v19.16b 2075 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 2076 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 2077 1.1 christos #ifndef __AARCH64EB__ 2078 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 2079 1.1 christos #endif 2080 1.1 christos aese v2.16b, v19.16b 2081 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 2082 1.1 christos ld1 {v29.4s}, [x8], #16 //load rk11 2083 1.1 christos 2084 1.1 christos aese v3.16b, v19.16b 2085 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 2086 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 2087 1.1 christos #ifndef __AARCH64EB__ 2088 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 2089 1.1 christos #endif 2090 1.1 christos aese v0.16b, v20.16b 2091 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 2092 1.1 christos 2093 1.1 christos aese v2.16b, v20.16b 2094 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 2095 1.1 christos 2096 1.1 christos aese v3.16b, v20.16b 2097 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 2098 1.1 christos 2099 1.1 christos aese v0.16b, v21.16b 2100 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 2101 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 2102 1.1 christos 2103 1.1 christos aese v2.16b, v21.16b 2104 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 2105 1.1 christos 2106 1.1 christos aese v1.16b, v20.16b 2107 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 2108 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 2109 1.1 christos 2110 1.1 christos aese v0.16b, v22.16b 2111 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 2112 1.1 christos 2113 1.1 christos aese v3.16b, v21.16b 2114 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 2115 1.1 christos 2116 1.1 christos aese v1.16b, v21.16b 2117 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 2118 1.1 christos 2119 1.1 christos aese v0.16b, v23.16b 2120 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 2121 1.1 christos 2122 1.1 christos aese v2.16b, v22.16b 2123 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 2124 1.1 christos 2125 1.1 christos aese v1.16b, v22.16b 2126 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 2127 1.1 christos 2128 1.1 christos aese v0.16b, v24.16b 2129 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 2130 1.1 christos 2131 1.1 christos aese v3.16b, v22.16b 2132 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 2133 1.1 christos 2134 1.1 christos aese v2.16b, v23.16b 2135 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 2136 1.1 christos 2137 1.1 christos aese v1.16b, v23.16b 2138 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 2139 1.1 christos 2140 1.1 christos aese v3.16b, v23.16b 2141 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 2142 1.1 christos 2143 1.1 christos aese v2.16b, v24.16b 2144 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 2145 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 2146 1.1 christos #ifndef __AARCH64EB__ 2147 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 2148 1.1 christos #endif 2149 1.1 christos aese v1.16b, v24.16b 2150 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 2151 1.1 christos 2152 1.1 christos aese v3.16b, v24.16b 2153 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 2154 1.1 christos 2155 1.1 christos aese v0.16b, v25.16b 2156 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 2157 1.1 christos 2158 1.1 christos aese v1.16b, v25.16b 2159 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 2160 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 2161 1.1 christos 2162 1.1 christos aese v3.16b, v25.16b 2163 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 2164 1.1 christos 2165 1.1 christos aese v0.16b, v26.16b 2166 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 2167 1.1 christos 2168 1.1 christos aese v2.16b, v25.16b 2169 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 2170 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 2171 1.1 christos 2172 1.1 christos aese v1.16b, v26.16b 2173 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 2174 1.1 christos 2175 1.1 christos aese v3.16b, v26.16b 2176 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 2177 1.1 christos 2178 1.1 christos aese v2.16b, v26.16b 2179 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 2180 1.1 christos 2181 1.1 christos aese v0.16b, v27.16b 2182 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 9 2183 1.1 christos 2184 1.1 christos aese v3.16b, v27.16b 2185 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 9 2186 1.1 christos 2187 1.1 christos aese v2.16b, v27.16b 2188 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 9 2189 1.1 christos 2190 1.1 christos aese v1.16b, v27.16b 2191 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 9 2192 1.1 christos 2193 1.1 christos aese v0.16b, v28.16b 2194 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 10 2195 1.1 christos 2196 1.1 christos aese v2.16b, v28.16b 2197 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 10 2198 1.1 christos 2199 1.1 christos aese v1.16b, v28.16b 2200 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 10 2201 1.1 christos lsr x5, x1, #3 //byte_len 2202 1.1 christos mov x15, x5 2203 1.1 christos 2204 1.1 christos aese v3.16b, v28.16b 2205 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 10 2206 1.1 christos sub x5, x5, #1 //byte_len - 1 2207 1.1 christos 2208 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 2209 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2210 1.1 christos 2211 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 2212 1.1 christos 2213 1.1 christos aese v2.16b, v29.16b //AES block 2 - round 11 2214 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 2215 1.1 christos add x5, x5, x0 2216 1.1 christos 2217 1.1 christos aese v1.16b, v29.16b //AES block 1 - round 11 2218 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 2219 1.1 christos 2220 1.1 christos aese v0.16b, v29.16b //AES block 0 - round 11 2221 1.1 christos add w12, w12, #1 //CTR block 3 2222 1.1 christos 2223 1.1 christos aese v3.16b, v29.16b //AES block 3 - round 11 2224 1.1 christos b.ge .L192_enc_tail //handle tail 2225 1.1 christos 2226 1.1 christos rev w9, w12 //CTR block 4 2227 1.1 christos ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 2228 1.1 christos #ifdef __AARCH64EB__ 2229 1.1 christos rev x6, x6 2230 1.1 christos rev x7, x7 2231 1.1 christos #endif 2232 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 2233 1.1 christos ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 2234 1.1 christos #ifdef __AARCH64EB__ 2235 1.1 christos rev x21, x21 2236 1.1 christos rev x22, x22 2237 1.1 christos #endif 2238 1.1 christos ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 2239 1.1 christos #ifdef __AARCH64EB__ 2240 1.1 christos rev x23, x23 2241 1.1 christos rev x24, x24 2242 1.1 christos #endif 2243 1.1 christos ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 2244 1.1 christos #ifdef __AARCH64EB__ 2245 1.1 christos rev x19, x19 2246 1.1 christos rev x20, x20 2247 1.1 christos #endif 2248 1.1 christos add x0, x0, #64 //AES input_ptr update 2249 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 2250 1.1 christos 2251 1.1 christos eor x6, x6, x13 //AES block 0 - round 12 low 2252 1.1 christos 2253 1.1 christos eor x7, x7, x14 //AES block 0 - round 12 high 2254 1.1 christos eor x22, x22, x14 //AES block 2 - round 12 high 2255 1.1 christos fmov d4, x6 //AES block 0 - mov low 2256 1.1 christos 2257 1.1 christos eor x24, x24, x14 //AES block 3 - round 12 high 2258 1.1 christos fmov v4.d[1], x7 //AES block 0 - mov high 2259 1.1 christos 2260 1.1 christos eor x21, x21, x13 //AES block 2 - round 12 low 2261 1.1 christos eor x19, x19, x13 //AES block 1 - round 12 low 2262 1.1 christos 2263 1.1 christos fmov d5, x19 //AES block 1 - mov low 2264 1.1 christos eor x20, x20, x14 //AES block 1 - round 12 high 2265 1.1 christos 2266 1.1 christos fmov v5.d[1], x20 //AES block 1 - mov high 2267 1.1 christos 2268 1.1 christos eor x23, x23, x13 //AES block 3 - round 12 low 2269 1.1 christos fmov d6, x21 //AES block 2 - mov low 2270 1.1 christos 2271 1.1 christos add w12, w12, #1 //CTR block 4 2272 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 0 - result 2273 1.1 christos fmov d0, x10 //CTR block 4 2274 1.1 christos 2275 1.1 christos fmov v0.d[1], x9 //CTR block 4 2276 1.1 christos rev w9, w12 //CTR block 5 2277 1.1 christos 2278 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 2279 1.1 christos add w12, w12, #1 //CTR block 5 2280 1.1 christos 2281 1.1 christos fmov d7, x23 //AES block 3 - mov low 2282 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 0 - store result 2283 1.1 christos 2284 1.1 christos fmov v6.d[1], x22 //AES block 2 - mov high 2285 1.1 christos 2286 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 1 - result 2287 1.1 christos fmov d1, x10 //CTR block 5 2288 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 1 - store result 2289 1.1 christos 2290 1.1 christos fmov v7.d[1], x24 //AES block 3 - mov high 2291 1.1 christos 2292 1.1 christos fmov v1.d[1], x9 //CTR block 5 2293 1.1 christos rev w9, w12 //CTR block 6 2294 1.1 christos 2295 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 2296 1.1 christos 2297 1.1 christos add w12, w12, #1 //CTR block 6 2298 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 2 - result 2299 1.1 christos fmov d2, x10 //CTR block 6 2300 1.1 christos 2301 1.1 christos fmov v2.d[1], x9 //CTR block 6 2302 1.1 christos rev w9, w12 //CTR block 7 2303 1.1 christos 2304 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 7 2305 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 2 - store result 2306 1.1 christos 2307 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 3 - result 2308 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 3 - store result 2309 1.1 christos b.ge .L192_enc_prepretail //do prepretail 2310 1.1 christos 2311 1.1 christos .L192_enc_main_loop: //main loop start 2312 1.1 christos aese v2.16b, v18.16b 2313 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2314 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2315 1.1 christos 2316 1.1 christos aese v1.16b, v18.16b 2317 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 2318 1.1 christos ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext 2319 1.1 christos #ifdef __AARCH64EB__ 2320 1.1 christos rev x19, x19 2321 1.1 christos rev x20, x20 2322 1.1 christos #endif 2323 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2324 1.1 christos fmov d3, x10 //CTR block 4k+3 2325 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2326 1.1 christos 2327 1.1 christos aese v2.16b, v19.16b 2328 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2329 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 2330 1.1 christos 2331 1.1 christos pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2332 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2333 1.1 christos ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 2334 1.1 christos #ifdef __AARCH64EB__ 2335 1.1 christos rev x21, x21 2336 1.1 christos rev x22, x22 2337 1.1 christos #endif 2338 1.1 christos aese v0.16b, v18.16b 2339 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2340 1.1 christos ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext 2341 1.1 christos #ifdef __AARCH64EB__ 2342 1.1 christos rev x23, x23 2343 1.1 christos rev x24, x24 2344 1.1 christos #endif 2345 1.1 christos pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2346 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 2347 1.1 christos 2348 1.1 christos aese v1.16b, v19.16b 2349 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 2350 1.1 christos 2351 1.1 christos aese v0.16b, v19.16b 2352 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2353 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2354 1.1 christos 2355 1.1 christos aese v3.16b, v18.16b 2356 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2357 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 12 high 2358 1.1 christos 2359 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2360 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 2361 1.1 christos 2362 1.1 christos aese v0.16b, v20.16b 2363 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2364 1.1 christos 2365 1.1 christos aese v3.16b, v19.16b 2366 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2367 1.1 christos eor x21, x21, x13 //AES block 4k+6 - round 12 low 2368 1.1 christos 2369 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2370 1.1 christos eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2371 1.1 christos 2372 1.1 christos aese v0.16b, v21.16b 2373 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2374 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 12 low 2375 1.1 christos 2376 1.1 christos aese v1.16b, v20.16b 2377 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2378 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 2379 1.1 christos 2380 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2381 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 2382 1.1 christos 2383 1.1 christos aese v2.16b, v20.16b 2384 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2385 1.1 christos 2386 1.1 christos aese v1.16b, v21.16b 2387 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2388 1.1 christos 2389 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 2390 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2391 1.1 christos 2392 1.1 christos aese v3.16b, v20.16b 2393 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2394 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2395 1.1 christos 2396 1.1 christos pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2397 1.1 christos 2398 1.1 christos aese v0.16b, v22.16b 2399 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2400 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2401 1.1 christos 2402 1.1 christos aese v3.16b, v21.16b 2403 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2404 1.1 christos 2405 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2406 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 12 high 2407 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2408 1.1 christos 2409 1.1 christos aese v0.16b, v23.16b 2410 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2411 1.1 christos add w12, w12, #1 //CTR block 4k+3 2412 1.1 christos 2413 1.1 christos aese v3.16b, v22.16b 2414 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2415 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2416 1.1 christos 2417 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2418 1.1 christos eor x22, x22, x14 //AES block 4k+6 - round 12 high 2419 1.1 christos 2420 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2421 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 12 low 2422 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 2423 1.1 christos 2424 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2425 1.1 christos rev w9, w12 //CTR block 4k+8 2426 1.1 christos 2427 1.1 christos pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2428 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 2429 1.1 christos 2430 1.1 christos aese v2.16b, v21.16b 2431 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2432 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 2433 1.1 christos 2434 1.1 christos aese v1.16b, v22.16b 2435 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2436 1.1 christos ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 2437 1.1 christos #ifdef __AARCH64EB__ 2438 1.1 christos rev x6, x6 2439 1.1 christos rev x7, x7 2440 1.1 christos #endif 2441 1.1 christos aese v0.16b, v24.16b 2442 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 2443 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2444 1.1 christos 2445 1.1 christos aese v2.16b, v22.16b 2446 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2447 1.1 christos add x0, x0, #64 //AES input_ptr update 2448 1.1 christos 2449 1.1 christos aese v1.16b, v23.16b 2450 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2451 1.1 christos movi v8.8b, #0xc2 2452 1.1 christos 2453 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2454 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 12 high 2455 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2456 1.1 christos 2457 1.1 christos aese v2.16b, v23.16b 2458 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2459 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 12 low 2460 1.1 christos 2461 1.1 christos aese v1.16b, v24.16b 2462 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2463 1.1 christos shl d8, d8, #56 //mod_constant 2464 1.1 christos 2465 1.1 christos aese v3.16b, v23.16b 2466 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2467 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2468 1.1 christos 2469 1.1 christos aese v0.16b, v25.16b 2470 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2471 1.1 christos fmov d5, x19 //AES block 4k+5 - mov low 2472 1.1 christos 2473 1.1 christos aese v1.16b, v25.16b 2474 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2475 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2476 1.1 christos 2477 1.1 christos aese v3.16b, v24.16b 2478 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2479 1.1 christos fmov v5.d[1], x20 //AES block 4k+5 - mov high 2480 1.1 christos 2481 1.1 christos aese v0.16b, v26.16b 2482 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2483 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2484 1.1 christos 2485 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 2486 1.1 christos cmp x0, x5 //.LOOP CONTROL 2487 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 2488 1.1 christos 2489 1.1 christos aese v2.16b, v24.16b 2490 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2491 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 2492 1.1 christos 2493 1.1 christos aese v1.16b, v26.16b 2494 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2495 1.1 christos fmov d7, x23 //AES block 4k+3 - mov low 2496 1.1 christos 2497 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2498 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2499 1.1 christos add w12, w12, #1 //CTR block 4k+8 2500 1.1 christos 2501 1.1 christos aese v2.16b, v25.16b 2502 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2503 1.1 christos fmov v7.d[1], x24 //AES block 4k+3 - mov high 2504 1.1 christos 2505 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 2506 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 2507 1.1 christos fmov d6, x21 //AES block 4k+6 - mov low 2508 1.1 christos 2509 1.1 christos aese v3.16b, v25.16b 2510 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2511 1.1 christos 2512 1.1 christos aese v0.16b, v27.16b 2513 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2514 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 2515 1.1 christos 2516 1.1 christos aese v2.16b, v26.16b 2517 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2518 1.1 christos 2519 1.1 christos aese v3.16b, v26.16b 2520 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2521 1.1 christos 2522 1.1 christos aese v1.16b, v27.16b 2523 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2524 1.1 christos 2525 1.1 christos aese v0.16b, v28.16b 2526 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2527 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 2528 1.1 christos 2529 1.1 christos aese v3.16b, v27.16b 2530 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2531 1.1 christos 2532 1.1 christos aese v2.16b, v27.16b 2533 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2534 1.1 christos 2535 1.1 christos aese v0.16b, v29.16b //AES block 4k+4 - round 11 2536 1.1 christos 2537 1.1 christos aese v1.16b, v28.16b 2538 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2539 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 2540 1.1 christos 2541 1.1 christos aese v2.16b, v28.16b 2542 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2543 1.1 christos 2544 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 2545 1.1 christos fmov d0, x10 //CTR block 4k+8 2546 1.1 christos 2547 1.1 christos aese v1.16b, v29.16b //AES block 4k+5 - round 11 2548 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 2549 1.1 christos rev w9, w12 //CTR block 4k+9 2550 1.1 christos 2551 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 2552 1.1 christos fmov v6.d[1], x22 //AES block 4k+6 - mov high 2553 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result 2554 1.1 christos 2555 1.1 christos aese v3.16b, v28.16b 2556 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2557 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 2558 1.1 christos 2559 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 2560 1.1 christos add w12, w12, #1 //CTR block 4k+9 2561 1.1 christos fmov d1, x10 //CTR block 4k+9 2562 1.1 christos 2563 1.1 christos aese v2.16b, v29.16b //AES block 4k+6 - round 11 2564 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 2565 1.1 christos rev w9, w12 //CTR block 4k+10 2566 1.1 christos 2567 1.1 christos add w12, w12, #1 //CTR block 4k+10 2568 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 2569 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 2570 1.1 christos 2571 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 2572 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 2573 1.1 christos 2574 1.1 christos aese v3.16b, v29.16b //AES block 4k+7 - round 11 2575 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 2576 1.1 christos fmov d2, x10 //CTR block 4k+10 2577 1.1 christos 2578 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result 2579 1.1 christos fmov v2.d[1], x9 //CTR block 4k+10 2580 1.1 christos rev w9, w12 //CTR block 4k+11 2581 1.1 christos 2582 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 2583 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+11 2584 1.1 christos 2585 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result 2586 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result 2587 1.1 christos b.lt .L192_enc_main_loop 2588 1.1 christos 2589 1.1 christos .L192_enc_prepretail: //PREPRETAIL 2590 1.1 christos aese v0.16b, v18.16b 2591 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2592 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2593 1.1 christos 2594 1.1 christos fmov d3, x10 //CTR block 4k+3 2595 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2596 1.1 christos add w12, w12, #1 //CTR block 4k+3 2597 1.1 christos 2598 1.1 christos aese v1.16b, v18.16b 2599 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 2600 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2601 1.1 christos 2602 1.1 christos aese v2.16b, v18.16b 2603 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2604 1.1 christos 2605 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 2606 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 2607 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 2608 1.1 christos 2609 1.1 christos aese v1.16b, v19.16b 2610 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 2611 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2612 1.1 christos 2613 1.1 christos pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2614 1.1 christos 2615 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2616 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 2617 1.1 christos 2618 1.1 christos pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2619 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2620 1.1 christos 2621 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2622 1.1 christos 2623 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2624 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 2625 1.1 christos 2626 1.1 christos eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2627 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 2628 1.1 christos 2629 1.1 christos aese v3.16b, v18.16b 2630 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2631 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2632 1.1 christos 2633 1.1 christos pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2634 1.1 christos 2635 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2636 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2637 1.1 christos 2638 1.1 christos aese v3.16b, v19.16b 2639 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2640 1.1 christos 2641 1.1 christos aese v2.16b, v19.16b 2642 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2643 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2644 1.1 christos 2645 1.1 christos aese v0.16b, v19.16b 2646 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2647 1.1 christos 2648 1.1 christos aese v1.16b, v20.16b 2649 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2650 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 2651 1.1 christos 2652 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2653 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2654 1.1 christos 2655 1.1 christos aese v0.16b, v20.16b 2656 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2657 1.1 christos 2658 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2659 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 2660 1.1 christos 2661 1.1 christos aese v1.16b, v21.16b 2662 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2663 1.1 christos 2664 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2665 1.1 christos 2666 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2667 1.1 christos 2668 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 2669 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2670 1.1 christos 2671 1.1 christos pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2672 1.1 christos 2673 1.1 christos aese v0.16b, v21.16b 2674 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2675 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2676 1.1 christos 2677 1.1 christos aese v3.16b, v20.16b 2678 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2679 1.1 christos 2680 1.1 christos aese v2.16b, v20.16b 2681 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2682 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2683 1.1 christos 2684 1.1 christos aese v0.16b, v22.16b 2685 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2686 1.1 christos 2687 1.1 christos aese v3.16b, v21.16b 2688 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2689 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2690 1.1 christos 2691 1.1 christos aese v2.16b, v21.16b 2692 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2693 1.1 christos 2694 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2695 1.1 christos movi v8.8b, #0xc2 2696 1.1 christos 2697 1.1 christos aese v3.16b, v22.16b 2698 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2699 1.1 christos 2700 1.1 christos aese v2.16b, v22.16b 2701 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2702 1.1 christos 2703 1.1 christos aese v1.16b, v22.16b 2704 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2705 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2706 1.1 christos 2707 1.1 christos aese v3.16b, v23.16b 2708 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2709 1.1 christos 2710 1.1 christos aese v2.16b, v23.16b 2711 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2712 1.1 christos 2713 1.1 christos aese v1.16b, v23.16b 2714 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2715 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2716 1.1 christos 2717 1.1 christos aese v0.16b, v23.16b 2718 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2719 1.1 christos 2720 1.1 christos aese v3.16b, v24.16b 2721 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2722 1.1 christos eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 2723 1.1 christos 2724 1.1 christos aese v1.16b, v24.16b 2725 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2726 1.1 christos 2727 1.1 christos aese v0.16b, v24.16b 2728 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 2729 1.1 christos shl d8, d8, #56 //mod_constant 2730 1.1 christos 2731 1.1 christos aese v3.16b, v25.16b 2732 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2733 1.1 christos 2734 1.1 christos aese v1.16b, v25.16b 2735 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2736 1.1 christos eor v10.16b, v10.16b, v11.16b 2737 1.1 christos 2738 1.1 christos aese v0.16b, v25.16b 2739 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2740 1.1 christos 2741 1.1 christos pmull v30.1q, v9.1d, v8.1d 2742 1.1 christos 2743 1.1 christos aese v2.16b, v24.16b 2744 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2745 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 2746 1.1 christos 2747 1.1 christos aese v0.16b, v26.16b 2748 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2749 1.1 christos 2750 1.1 christos aese v1.16b, v26.16b 2751 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2752 1.1 christos eor v10.16b, v10.16b, v30.16b 2753 1.1 christos 2754 1.1 christos aese v2.16b, v25.16b 2755 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2756 1.1 christos 2757 1.1 christos aese v3.16b, v26.16b 2758 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2759 1.1 christos 2760 1.1 christos aese v0.16b, v27.16b 2761 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2762 1.1 christos 2763 1.1 christos aese v2.16b, v26.16b 2764 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2765 1.1 christos eor v10.16b, v10.16b, v9.16b 2766 1.1 christos 2767 1.1 christos aese v3.16b, v27.16b 2768 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2769 1.1 christos 2770 1.1 christos aese v1.16b, v27.16b 2771 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2772 1.1 christos 2773 1.1 christos aese v2.16b, v27.16b 2774 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2775 1.1 christos 2776 1.1 christos pmull v30.1q, v10.1d, v8.1d 2777 1.1 christos 2778 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 2779 1.1 christos 2780 1.1 christos aese v3.16b, v28.16b 2781 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2782 1.1 christos 2783 1.1 christos aese v0.16b, v28.16b 2784 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2785 1.1 christos 2786 1.1 christos aese v2.16b, v28.16b 2787 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2788 1.1 christos 2789 1.1 christos aese v1.16b, v28.16b 2790 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2791 1.1 christos eor v11.16b, v11.16b, v30.16b 2792 1.1 christos 2793 1.1 christos aese v0.16b, v29.16b //AES block 4k+4 - round 11 2794 1.1 christos 2795 1.1 christos aese v3.16b, v29.16b //AES block 4k+7 - round 11 2796 1.1 christos 2797 1.1 christos aese v2.16b, v29.16b //AES block 4k+6 - round 11 2798 1.1 christos 2799 1.1 christos aese v1.16b, v29.16b //AES block 4k+5 - round 11 2800 1.1 christos eor v11.16b, v11.16b, v10.16b 2801 1.1 christos .L192_enc_tail: //TAIL 2802 1.1 christos 2803 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 2804 1.1 christos ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 2805 1.1 christos #ifdef __AARCH64EB__ 2806 1.1 christos rev x6, x6 2807 1.1 christos rev x7, x7 2808 1.1 christos #endif 2809 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 12 low 2810 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 12 high 2811 1.1 christos 2812 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 2813 1.1 christos 2814 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 2815 1.1 christos cmp x5, #48 2816 1.1 christos 2817 1.1 christos eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 2818 1.1 christos 2819 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 2820 1.1 christos b.gt .L192_enc_blocks_more_than_3 2821 1.1 christos 2822 1.1 christos sub w12, w12, #1 2823 1.1 christos movi v10.8b, #0 2824 1.1 christos 2825 1.1 christos mov v3.16b, v2.16b 2826 1.1 christos movi v9.8b, #0 2827 1.1 christos cmp x5, #32 2828 1.1 christos 2829 1.1 christos mov v2.16b, v1.16b 2830 1.1 christos movi v11.8b, #0 2831 1.1 christos b.gt .L192_enc_blocks_more_than_2 2832 1.1 christos 2833 1.1 christos sub w12, w12, #1 2834 1.1 christos 2835 1.1 christos mov v3.16b, v1.16b 2836 1.1 christos cmp x5, #16 2837 1.1 christos b.gt .L192_enc_blocks_more_than_1 2838 1.1 christos 2839 1.1 christos sub w12, w12, #1 2840 1.1 christos b .L192_enc_blocks_less_than_1 2841 1.1 christos .L192_enc_blocks_more_than_3: //blocks left > 3 2842 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 2843 1.1 christos 2844 1.1 christos ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 2845 1.1 christos #ifdef __AARCH64EB__ 2846 1.1 christos rev x6, x6 2847 1.1 christos rev x7, x7 2848 1.1 christos #endif 2849 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 2850 1.1 christos 2851 1.1 christos eor x6, x6, x13 //AES final-2 block - round 12 low 2852 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 2853 1.1 christos 2854 1.1 christos eor x7, x7, x14 //AES final-2 block - round 12 high 2855 1.1 christos fmov d5, x6 //AES final-2 block - mov low 2856 1.1 christos 2857 1.1 christos fmov v5.d[1], x7 //AES final-2 block - mov high 2858 1.1 christos 2859 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 2860 1.1 christos 2861 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 2862 1.1 christos 2863 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 2864 1.1 christos 2865 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 2866 1.1 christos 2867 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 2868 1.1 christos 2869 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 2870 1.1 christos 2871 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 2872 1.1 christos eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 2873 1.1 christos .L192_enc_blocks_more_than_2: //blocks left > 2 2874 1.1 christos 2875 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 2876 1.1 christos 2877 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 2878 1.1 christos ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high 2879 1.1 christos #ifdef __AARCH64EB__ 2880 1.1 christos rev x6, x6 2881 1.1 christos rev x7, x7 2882 1.1 christos #endif 2883 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 2884 1.1 christos 2885 1.1 christos eor x7, x7, x14 //AES final-1 block - round 12 high 2886 1.1 christos 2887 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 2888 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 2889 1.1 christos 2890 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 2891 1.1 christos eor x6, x6, x13 //AES final-1 block - round 12 low 2892 1.1 christos 2893 1.1 christos fmov d5, x6 //AES final-1 block - mov low 2894 1.1 christos 2895 1.1 christos fmov v5.d[1], x7 //AES final-1 block - mov high 2896 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 2897 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 2898 1.1 christos 2899 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 2900 1.1 christos 2901 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 2902 1.1 christos 2903 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 2904 1.1 christos 2905 1.1 christos eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 2906 1.1 christos 2907 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 2908 1.1 christos .L192_enc_blocks_more_than_1: //blocks left > 1 2909 1.1 christos 2910 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 2911 1.1 christos 2912 1.1 christos ldp x6, x7, [x0], #16 //AES final block - load input low & high 2913 1.1 christos #ifdef __AARCH64EB__ 2914 1.1 christos rev x6, x6 2915 1.1 christos rev x7, x7 2916 1.1 christos #endif 2917 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 2918 1.1 christos 2919 1.1 christos eor x6, x6, x13 //AES final block - round 12 low 2920 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 2921 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 2922 1.1 christos 2923 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 2924 1.1 christos 2925 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 2926 1.1 christos eor x7, x7, x14 //AES final block - round 12 high 2927 1.1 christos fmov d5, x6 //AES final block - mov low 2928 1.1 christos 2929 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 2930 1.1 christos fmov v5.d[1], x7 //AES final block - mov high 2931 1.1 christos 2932 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 2933 1.1 christos 2934 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 2935 1.1 christos 2936 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 2937 1.1 christos 2938 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 2939 1.1 christos 2940 1.1 christos eor v5.16b, v5.16b, v3.16b //AES final block - result 2941 1.1 christos 2942 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 2943 1.1 christos 2944 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 2945 1.1 christos .L192_enc_blocks_less_than_1: //blocks left <= 1 2946 1.1 christos 2947 1.1 christos ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 2948 1.1 christos #ifndef __AARCH64EB__ 2949 1.1 christos rev w9, w12 2950 1.1 christos #else 2951 1.1 christos mov w9, w12 2952 1.1 christos #endif 2953 1.1 christos and x1, x1, #127 //bit_length %= 128 2954 1.1 christos 2955 1.1 christos sub x1, x1, #128 //bit_length -= 128 2956 1.1 christos mvn x14, xzr //rk12_h = 0xffffffffffffffff 2957 1.1 christos 2958 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 2959 1.1 christos mvn x13, xzr //rk12_l = 0xffffffffffffffff 2960 1.1 christos 2961 1.1 christos and x1, x1, #127 //bit_length %= 128 2962 1.1 christos 2963 1.1 christos lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 2964 1.1 christos cmp x1, #64 2965 1.1 christos 2966 1.1 christos csel x6, x13, x14, lt 2967 1.1 christos csel x7, x14, xzr, lt 2968 1.1 christos 2969 1.1 christos fmov d0, x6 //ctr0b is mask for last block 2970 1.1 christos 2971 1.1 christos fmov v0.d[1], x7 2972 1.1 christos 2973 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 2974 1.1 christos 2975 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 2976 1.1 christos 2977 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 2978 1.1 christos 2979 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 2980 1.1 christos 2981 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 2982 1.1 christos 2983 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 2984 1.1 christos 2985 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 2986 1.1 christos 2987 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 2988 1.1 christos 2989 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 2990 1.1 christos 2991 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 2992 1.1 christos 2993 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 2994 1.1 christos movi v8.8b, #0xc2 2995 1.1 christos 2996 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2997 1.1 christos 2998 1.1 christos shl d8, d8, #56 //mod_constant 2999 1.1 christos 3000 1.1 christos bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 3001 1.1 christos 3002 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3003 1.1 christos 3004 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3005 1.1 christos 3006 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3007 1.1 christos 3008 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3009 1.1 christos 3010 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3011 1.1 christos 3012 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3013 1.1 christos 3014 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3015 1.1 christos 3016 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 3017 1.1 christos str w9, [x16, #12] //store the updated counter 3018 1.1 christos 3019 1.1 christos st1 { v5.16b}, [x2] //store all 16B 3020 1.1 christos 3021 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3022 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 3023 1.1 christos rev64 v11.16b, v11.16b 3024 1.1 christos mov x0, x15 3025 1.1 christos st1 { v11.16b }, [x3] 3026 1.1 christos 3027 1.1 christos ldp x21, x22, [sp, #16] 3028 1.1 christos ldp x23, x24, [sp, #32] 3029 1.1 christos ldp d8, d9, [sp, #48] 3030 1.1 christos ldp d10, d11, [sp, #64] 3031 1.1 christos ldp d12, d13, [sp, #80] 3032 1.1 christos ldp d14, d15, [sp, #96] 3033 1.1 christos ldp x19, x20, [sp], #112 3034 1.1 christos ret 3035 1.1 christos 3036 1.1 christos .L192_enc_ret: 3037 1.1 christos mov w0, #0x0 3038 1.1 christos ret 3039 1.1 christos .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3040 1.1 christos .globl aes_gcm_dec_192_kernel 3041 1.1 christos .type aes_gcm_dec_192_kernel,%function 3042 1.1 christos .align 4 3043 1.1 christos aes_gcm_dec_192_kernel: 3044 1.2 christos AARCH64_VALID_CALL_TARGET 3045 1.1 christos cbz x1, .L192_dec_ret 3046 1.1 christos stp x19, x20, [sp, #-112]! 3047 1.1 christos mov x16, x4 3048 1.1 christos mov x8, x5 3049 1.1 christos stp x21, x22, [sp, #16] 3050 1.1 christos stp x23, x24, [sp, #32] 3051 1.1 christos stp d8, d9, [sp, #48] 3052 1.1 christos stp d10, d11, [sp, #64] 3053 1.1 christos stp d12, d13, [sp, #80] 3054 1.1 christos stp d14, d15, [sp, #96] 3055 1.1 christos 3056 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 3057 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 3058 1.1 christos #ifdef __AARCH64EB__ 3059 1.1 christos rev x10, x10 3060 1.1 christos rev x11, x11 3061 1.1 christos #endif 3062 1.1 christos ldp x13, x14, [x8, #192] //load rk12 3063 1.1 christos #ifdef __AARCH64EB__ 3064 1.1 christos ror x13, x13, #32 3065 1.1 christos ror x14, x14, #32 3066 1.1 christos #endif 3067 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 3068 1.1 christos 3069 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 3070 1.1 christos 3071 1.1 christos lsr x5, x1, #3 //byte_len 3072 1.1 christos mov x15, x5 3073 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 3074 1.1 christos 3075 1.1 christos lsr x12, x11, #32 3076 1.1 christos orr w11, w11, w11 3077 1.1 christos fmov d3, x10 //CTR block 3 3078 1.1 christos 3079 1.1 christos rev w12, w12 //rev_ctr32 3080 1.1 christos fmov d1, x10 //CTR block 1 3081 1.1 christos 3082 1.1 christos add w12, w12, #1 //increment rev_ctr32 3083 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 3084 1.1 christos 3085 1.1 christos aese v0.16b, v18.16b 3086 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 3087 1.1 christos rev w9, w12 //CTR block 1 3088 1.1 christos 3089 1.1 christos add w12, w12, #1 //CTR block 1 3090 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 3091 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 3092 1.1 christos 3093 1.1 christos fmov v1.d[1], x9 //CTR block 1 3094 1.1 christos rev w9, w12 //CTR block 2 3095 1.1 christos add w12, w12, #1 //CTR block 2 3096 1.1 christos 3097 1.1 christos fmov d2, x10 //CTR block 2 3098 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 3099 1.1 christos 3100 1.1 christos fmov v2.d[1], x9 //CTR block 2 3101 1.1 christos rev w9, w12 //CTR block 3 3102 1.1 christos 3103 1.1 christos aese v0.16b, v19.16b 3104 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 3105 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 3106 1.1 christos 3107 1.1 christos fmov v3.d[1], x9 //CTR block 3 3108 1.1 christos 3109 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 3110 1.1 christos 3111 1.1 christos aese v0.16b, v20.16b 3112 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 3113 1.1 christos 3114 1.1 christos aese v2.16b, v18.16b 3115 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 3116 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 3117 1.1 christos 3118 1.1 christos aese v1.16b, v18.16b 3119 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 3120 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 3121 1.1 christos #ifndef __AARCH64EB__ 3122 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 3123 1.1 christos #endif 3124 1.1 christos aese v3.16b, v18.16b 3125 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 3126 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 3127 1.1 christos #ifndef __AARCH64EB__ 3128 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 3129 1.1 christos #endif 3130 1.1 christos aese v2.16b, v19.16b 3131 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 3132 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 3133 1.1 christos #ifndef __AARCH64EB__ 3134 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 3135 1.1 christos #endif 3136 1.1 christos aese v1.16b, v19.16b 3137 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 3138 1.1 christos 3139 1.1 christos aese v3.16b, v19.16b 3140 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 3141 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 3142 1.1 christos #ifndef __AARCH64EB__ 3143 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 3144 1.1 christos #endif 3145 1.1 christos aese v2.16b, v20.16b 3146 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 3147 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 3148 1.1 christos 3149 1.1 christos aese v0.16b, v21.16b 3150 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 3151 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 3152 1.1 christos 3153 1.1 christos aese v1.16b, v20.16b 3154 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 3155 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 3156 1.1 christos 3157 1.1 christos aese v3.16b, v20.16b 3158 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 3159 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 3160 1.1 christos 3161 1.1 christos aese v2.16b, v21.16b 3162 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 3163 1.1 christos ld1 { v11.16b}, [x3] 3164 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 3165 1.1 christos rev64 v11.16b, v11.16b 3166 1.1 christos 3167 1.1 christos aese v1.16b, v21.16b 3168 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 3169 1.1 christos add w12, w12, #1 //CTR block 3 3170 1.1 christos 3171 1.1 christos aese v3.16b, v21.16b 3172 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 3173 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 3174 1.1 christos 3175 1.1 christos aese v0.16b, v22.16b 3176 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 3177 1.1 christos ld1 {v28.4s}, [x8], #16 //load rk10 3178 1.1 christos 3179 1.1 christos aese v1.16b, v22.16b 3180 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 3181 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 3182 1.1 christos 3183 1.1 christos aese v2.16b, v22.16b 3184 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 3185 1.1 christos 3186 1.1 christos aese v3.16b, v22.16b 3187 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 3188 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 3189 1.1 christos 3190 1.1 christos aese v0.16b, v23.16b 3191 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 3192 1.1 christos ld1 {v29.4s}, [x8], #16 //load rk11 3193 1.1 christos 3194 1.1 christos aese v1.16b, v23.16b 3195 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 3196 1.1 christos 3197 1.1 christos aese v2.16b, v23.16b 3198 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 3199 1.1 christos 3200 1.1 christos aese v3.16b, v23.16b 3201 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 3202 1.1 christos 3203 1.1 christos aese v0.16b, v24.16b 3204 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 3205 1.1 christos 3206 1.1 christos aese v2.16b, v24.16b 3207 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 3208 1.1 christos 3209 1.1 christos aese v3.16b, v24.16b 3210 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 3211 1.1 christos 3212 1.1 christos aese v0.16b, v25.16b 3213 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 3214 1.1 christos 3215 1.1 christos aese v2.16b, v25.16b 3216 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 3217 1.1 christos 3218 1.1 christos aese v3.16b, v25.16b 3219 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 3220 1.1 christos 3221 1.1 christos aese v1.16b, v24.16b 3222 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 3223 1.1 christos 3224 1.1 christos aese v2.16b, v26.16b 3225 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 3226 1.1 christos 3227 1.1 christos aese v3.16b, v26.16b 3228 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 3229 1.1 christos 3230 1.1 christos aese v1.16b, v25.16b 3231 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 3232 1.1 christos 3233 1.1 christos aese v2.16b, v27.16b 3234 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 9 3235 1.1 christos 3236 1.1 christos aese v3.16b, v27.16b 3237 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 9 3238 1.1 christos 3239 1.1 christos aese v1.16b, v26.16b 3240 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 3241 1.1 christos sub x5, x5, #1 //byte_len - 1 3242 1.1 christos 3243 1.1 christos aese v0.16b, v26.16b 3244 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 3245 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3246 1.1 christos 3247 1.1 christos aese v3.16b, v28.16b 3248 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 10 3249 1.1 christos add x5, x5, x0 3250 1.1 christos 3251 1.1 christos aese v1.16b, v27.16b 3252 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 9 3253 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 3254 1.1 christos 3255 1.1 christos aese v0.16b, v27.16b 3256 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 9 3257 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 3258 1.1 christos 3259 1.1 christos aese v3.16b, v29.16b //AES block 3 - round 11 3260 1.1 christos 3261 1.1 christos aese v2.16b, v28.16b 3262 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 10 3263 1.1 christos 3264 1.1 christos aese v1.16b, v28.16b 3265 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 10 3266 1.1 christos 3267 1.1 christos aese v0.16b, v28.16b 3268 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 10 3269 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 3270 1.1 christos 3271 1.1 christos aese v2.16b, v29.16b //AES block 2 - round 11 3272 1.1 christos 3273 1.1 christos aese v1.16b, v29.16b //AES block 1 - round 11 3274 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 3275 1.1 christos 3276 1.1 christos aese v0.16b, v29.16b //AES block 0 - round 11 3277 1.1 christos b.ge .L192_dec_tail //handle tail 3278 1.1 christos 3279 1.1 christos ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 3280 1.1 christos 3281 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 1 - result 3282 1.1 christos 3283 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 0 - result 3284 1.1 christos rev w9, w12 //CTR block 4 3285 1.1 christos ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext 3286 1.1 christos 3287 1.1 christos mov x19, v1.d[0] //AES block 1 - mov low 3288 1.1 christos 3289 1.1 christos mov x20, v1.d[1] //AES block 1 - mov high 3290 1.1 christos 3291 1.1 christos mov x6, v0.d[0] //AES block 0 - mov low 3292 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 3293 1.1 christos add w12, w12, #1 //CTR block 4 3294 1.1 christos 3295 1.1 christos mov x7, v0.d[1] //AES block 0 - mov high 3296 1.1 christos rev64 v4.16b, v4.16b //GHASH block 0 3297 1.1 christos 3298 1.1 christos fmov d0, x10 //CTR block 4 3299 1.1 christos rev64 v5.16b, v5.16b //GHASH block 1 3300 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 3301 1.1 christos 3302 1.1 christos eor x19, x19, x13 //AES block 1 - round 12 low 3303 1.1 christos #ifdef __AARCH64EB__ 3304 1.1 christos rev x19, x19 3305 1.1 christos #endif 3306 1.1 christos fmov v0.d[1], x9 //CTR block 4 3307 1.1 christos rev w9, w12 //CTR block 5 3308 1.1 christos 3309 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 3310 1.1 christos fmov d1, x10 //CTR block 5 3311 1.1 christos eor x20, x20, x14 //AES block 1 - round 12 high 3312 1.1 christos #ifdef __AARCH64EB__ 3313 1.1 christos rev x20, x20 3314 1.1 christos #endif 3315 1.1 christos add w12, w12, #1 //CTR block 5 3316 1.1 christos fmov v1.d[1], x9 //CTR block 5 3317 1.1 christos eor x6, x6, x13 //AES block 0 - round 12 low 3318 1.1 christos #ifdef __AARCH64EB__ 3319 1.1 christos rev x6, x6 3320 1.1 christos #endif 3321 1.1 christos rev w9, w12 //CTR block 6 3322 1.1 christos eor x7, x7, x14 //AES block 0 - round 12 high 3323 1.1 christos #ifdef __AARCH64EB__ 3324 1.1 christos rev x7, x7 3325 1.1 christos #endif 3326 1.1 christos stp x6, x7, [x2], #16 //AES block 0 - store result 3327 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 3328 1.1 christos 3329 1.1 christos stp x19, x20, [x2], #16 //AES block 1 - store result 3330 1.1 christos 3331 1.1 christos add w12, w12, #1 //CTR block 6 3332 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 2 - result 3333 1.1 christos b.ge .L192_dec_prepretail //do prepretail 3334 1.1 christos 3335 1.1 christos .L192_dec_main_loop: //main loop start 3336 1.1 christos aese v1.16b, v18.16b 3337 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3338 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3339 1.1 christos 3340 1.1 christos pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3341 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 3342 1.1 christos 3343 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 3344 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3345 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 3346 1.1 christos 3347 1.1 christos aese v1.16b, v19.16b 3348 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3349 1.1 christos fmov d2, x10 //CTR block 4k+6 3350 1.1 christos 3351 1.1 christos aese v0.16b, v18.16b 3352 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3353 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 3354 1.1 christos 3355 1.1 christos pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3356 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 3357 1.1 christos 3358 1.1 christos aese v1.16b, v20.16b 3359 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3360 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 3361 1.1 christos 3362 1.1 christos aese v0.16b, v19.16b 3363 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3364 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 3365 1.1 christos 3366 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3367 1.1 christos fmov d3, x10 //CTR block 4k+7 3368 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 3369 1.1 christos 3370 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3371 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 3372 1.1 christos rev w9, w12 //CTR block 4k+7 3373 1.1 christos 3374 1.1 christos aese v2.16b, v18.16b 3375 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3376 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 3377 1.1 christos 3378 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 3379 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3380 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 3381 1.1 christos 3382 1.1 christos aese v1.16b, v21.16b 3383 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3384 1.1 christos 3385 1.1 christos aese v0.16b, v20.16b 3386 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3387 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 12 high 3388 1.1 christos #ifdef __AARCH64EB__ 3389 1.1 christos rev x22, x22 3390 1.1 christos #endif 3391 1.1 christos aese v2.16b, v19.16b 3392 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3393 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3394 1.1 christos 3395 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3396 1.1 christos 3397 1.1 christos aese v3.16b, v18.16b 3398 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3399 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 3400 1.1 christos 3401 1.1 christos aese v2.16b, v20.16b 3402 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3403 1.1 christos 3404 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3405 1.1 christos eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3406 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 12 low 3407 1.1 christos #ifdef __AARCH64EB__ 3408 1.1 christos rev x21, x21 3409 1.1 christos #endif 3410 1.1 christos aese v1.16b, v22.16b 3411 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3412 1.1 christos 3413 1.1 christos aese v0.16b, v21.16b 3414 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3415 1.1 christos 3416 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3417 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 3418 1.1 christos 3419 1.1 christos aese v3.16b, v19.16b 3420 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3421 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3422 1.1 christos 3423 1.1 christos aese v0.16b, v22.16b 3424 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3425 1.1 christos 3426 1.1 christos pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3427 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3428 1.1 christos 3429 1.1 christos pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3430 1.1 christos 3431 1.1 christos aese v0.16b, v23.16b 3432 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3433 1.1 christos 3434 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3435 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 3436 1.1 christos 3437 1.1 christos aese v1.16b, v23.16b 3438 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3439 1.1 christos 3440 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3441 1.1 christos 3442 1.1 christos aese v3.16b, v20.16b 3443 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3444 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3445 1.1 christos 3446 1.1 christos aese v1.16b, v24.16b 3447 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3448 1.1 christos 3449 1.1 christos aese v0.16b, v24.16b 3450 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3451 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3452 1.1 christos 3453 1.1 christos aese v3.16b, v21.16b 3454 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3455 1.1 christos 3456 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3457 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3458 1.1 christos 3459 1.1 christos aese v0.16b, v25.16b 3460 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3461 1.1 christos 3462 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3463 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3464 1.1 christos 3465 1.1 christos aese v1.16b, v25.16b 3466 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3467 1.1 christos 3468 1.1 christos aese v0.16b, v26.16b 3469 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3470 1.1 christos movi v8.8b, #0xc2 3471 1.1 christos 3472 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3473 1.1 christos 3474 1.1 christos aese v1.16b, v26.16b 3475 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3476 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3477 1.1 christos 3478 1.1 christos aese v2.16b, v21.16b 3479 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3480 1.1 christos 3481 1.1 christos aese v0.16b, v27.16b 3482 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 3483 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3484 1.1 christos 3485 1.1 christos aese v3.16b, v22.16b 3486 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3487 1.1 christos 3488 1.1 christos aese v2.16b, v22.16b 3489 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3490 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3491 1.1 christos 3492 1.1 christos aese v0.16b, v28.16b 3493 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3494 1.1 christos 3495 1.1 christos aese v1.16b, v27.16b 3496 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3497 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3498 1.1 christos 3499 1.1 christos aese v2.16b, v23.16b 3500 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3501 1.1 christos 3502 1.1 christos aese v3.16b, v23.16b 3503 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3504 1.1 christos shl d8, d8, #56 //mod_constant 3505 1.1 christos 3506 1.1 christos aese v1.16b, v28.16b 3507 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3508 1.1 christos 3509 1.1 christos aese v2.16b, v24.16b 3510 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3511 1.1 christos ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3512 1.1 christos 3513 1.1 christos aese v3.16b, v24.16b 3514 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3515 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3516 1.1 christos 3517 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3518 1.1 christos ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 3519 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 12 low 3520 1.1 christos #ifdef __AARCH64EB__ 3521 1.1 christos rev x23, x23 3522 1.1 christos #endif 3523 1.1 christos aese v2.16b, v25.16b 3524 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3525 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3526 1.1 christos 3527 1.1 christos aese v0.16b, v29.16b //AES block 4k+4 - round 11 3528 1.1 christos add w12, w12, #1 //CTR block 4k+7 3529 1.1 christos 3530 1.1 christos aese v3.16b, v25.16b 3531 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3532 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3533 1.1 christos 3534 1.1 christos aese v2.16b, v26.16b 3535 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3536 1.1 christos ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 3537 1.1 christos 3538 1.1 christos aese v1.16b, v29.16b //AES block 4k+5 - round 11 3539 1.1 christos ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 3540 1.1 christos rev w9, w12 //CTR block 4k+8 3541 1.1 christos 3542 1.1 christos aese v3.16b, v26.16b 3543 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3544 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3545 1.1 christos 3546 1.1 christos aese v2.16b, v27.16b 3547 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3548 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3549 1.1 christos 3550 1.1 christos cmp x0, x5 //.LOOP CONTROL 3551 1.1 christos 3552 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 3553 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 12 high 3554 1.1 christos #ifdef __AARCH64EB__ 3555 1.1 christos rev x24, x24 3556 1.1 christos #endif 3557 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 3558 1.1 christos 3559 1.1 christos aese v2.16b, v28.16b 3560 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3561 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 3562 1.1 christos 3563 1.1 christos aese v3.16b, v27.16b 3564 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3565 1.1 christos 3566 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3567 1.1 christos mov x19, v1.d[0] //AES block 4k+5 - mov low 3568 1.1 christos 3569 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 3570 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3571 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+5 3572 1.1 christos 3573 1.1 christos aese v2.16b, v29.16b //AES block 4k+6 - round 11 3574 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 3575 1.1 christos 3576 1.1 christos aese v3.16b, v28.16b 3577 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3578 1.1 christos mov x20, v1.d[1] //AES block 4k+5 - mov high 3579 1.1 christos 3580 1.1 christos fmov d0, x10 //CTR block 4k+8 3581 1.1 christos add w12, w12, #1 //CTR block 4k+8 3582 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3583 1.1 christos 3584 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 3585 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 3586 1.1 christos rev w9, w12 //CTR block 4k+9 3587 1.1 christos 3588 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 12 low 3589 1.1 christos #ifdef __AARCH64EB__ 3590 1.1 christos rev x6, x6 3591 1.1 christos #endif 3592 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 3593 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3594 1.1 christos 3595 1.1 christos fmov d1, x10 //CTR block 4k+9 3596 1.1 christos add w12, w12, #1 //CTR block 4k+9 3597 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 12 low 3598 1.1 christos #ifdef __AARCH64EB__ 3599 1.1 christos rev x19, x19 3600 1.1 christos #endif 3601 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 3602 1.1 christos rev w9, w12 //CTR block 4k+10 3603 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 12 high 3604 1.1 christos #ifdef __AARCH64EB__ 3605 1.1 christos rev x20, x20 3606 1.1 christos #endif 3607 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 12 high 3608 1.1 christos #ifdef __AARCH64EB__ 3609 1.1 christos rev x7, x7 3610 1.1 christos #endif 3611 1.1 christos stp x6, x7, [x2], #16 //AES block 4k+4 - store result 3612 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3613 1.1 christos 3614 1.1 christos add w12, w12, #1 //CTR block 4k+10 3615 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k+4 3616 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 3617 1.1 christos 3618 1.1 christos aese v3.16b, v29.16b //AES block 4k+7 - round 11 3619 1.1 christos stp x19, x20, [x2], #16 //AES block 4k+5 - store result 3620 1.1 christos b.lt .L192_dec_main_loop 3621 1.1 christos 3622 1.1 christos .L192_dec_prepretail: //PREPRETAIL 3623 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 3624 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3625 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3626 1.1 christos 3627 1.1 christos aese v1.16b, v18.16b 3628 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3629 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 3630 1.1 christos 3631 1.1 christos aese v0.16b, v18.16b 3632 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3633 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 3634 1.1 christos 3635 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 3636 1.1 christos fmov d2, x10 //CTR block 4k+6 3637 1.1 christos 3638 1.1 christos aese v1.16b, v19.16b 3639 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3640 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 3641 1.1 christos 3642 1.1 christos aese v0.16b, v19.16b 3643 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3644 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 3645 1.1 christos 3646 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3647 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 3648 1.1 christos fmov d3, x10 //CTR block 4k+7 3649 1.1 christos 3650 1.1 christos aese v1.16b, v20.16b 3651 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3652 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 3653 1.1 christos 3654 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3655 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 3656 1.1 christos rev w9, w12 //CTR block 4k+7 3657 1.1 christos 3658 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 3659 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3660 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 3661 1.1 christos 3662 1.1 christos pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3663 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 12 high 3664 1.1 christos #ifdef __AARCH64EB__ 3665 1.1 christos rev x24, x24 3666 1.1 christos #endif 3667 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 3668 1.1 christos 3669 1.1 christos aese v0.16b, v20.16b 3670 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3671 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 12 low 3672 1.1 christos #ifdef __AARCH64EB__ 3673 1.1 christos rev x21, x21 3674 1.1 christos #endif 3675 1.1 christos pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3676 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 12 high 3677 1.1 christos #ifdef __AARCH64EB__ 3678 1.1 christos rev x22, x22 3679 1.1 christos #endif 3680 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3681 1.1 christos 3682 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3683 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 12 low 3684 1.1 christos #ifdef __AARCH64EB__ 3685 1.1 christos rev x23, x23 3686 1.1 christos #endif 3687 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3688 1.1 christos 3689 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 3690 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3691 1.1 christos 3692 1.1 christos aese v3.16b, v18.16b 3693 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3694 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3695 1.1 christos 3696 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3697 1.1 christos add w12, w12, #1 //CTR block 4k+7 3698 1.1 christos 3699 1.1 christos pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3700 1.1 christos eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3701 1.1 christos 3702 1.1 christos aese v2.16b, v18.16b 3703 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3704 1.1 christos 3705 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3706 1.1 christos mov d31, v6.d[1] //GHASH block 4k+2 - mid 3707 1.1 christos 3708 1.1 christos aese v3.16b, v19.16b 3709 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3710 1.1 christos 3711 1.1 christos aese v2.16b, v19.16b 3712 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3713 1.1 christos eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3714 1.1 christos 3715 1.1 christos eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3716 1.1 christos 3717 1.1 christos pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3718 1.1 christos 3719 1.1 christos aese v2.16b, v20.16b 3720 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3721 1.1 christos mov d30, v7.d[1] //GHASH block 4k+3 - mid 3722 1.1 christos 3723 1.1 christos aese v3.16b, v20.16b 3724 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3725 1.1 christos ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3726 1.1 christos 3727 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3728 1.1 christos 3729 1.1 christos aese v0.16b, v21.16b 3730 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3731 1.1 christos eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3732 1.1 christos 3733 1.1 christos aese v1.16b, v21.16b 3734 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3735 1.1 christos 3736 1.1 christos pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3737 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3738 1.1 christos 3739 1.1 christos aese v0.16b, v22.16b 3740 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3741 1.1 christos 3742 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3743 1.1 christos movi v8.8b, #0xc2 3744 1.1 christos 3745 1.1 christos pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3746 1.1 christos 3747 1.1 christos aese v2.16b, v21.16b 3748 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3749 1.1 christos 3750 1.1 christos shl d8, d8, #56 //mod_constant 3751 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3752 1.1 christos 3753 1.1 christos aese v0.16b, v23.16b 3754 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3755 1.1 christos eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3756 1.1 christos 3757 1.1 christos aese v2.16b, v22.16b 3758 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3759 1.1 christos 3760 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3761 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3762 1.1 christos 3763 1.1 christos aese v0.16b, v24.16b 3764 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3765 1.1 christos 3766 1.1 christos aese v3.16b, v21.16b 3767 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3768 1.1 christos eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3769 1.1 christos 3770 1.1 christos aese v2.16b, v23.16b 3771 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3772 1.1 christos 3773 1.1 christos aese v0.16b, v25.16b 3774 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3775 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3776 1.1 christos 3777 1.1 christos aese v3.16b, v22.16b 3778 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3779 1.1 christos 3780 1.1 christos aese v2.16b, v24.16b 3781 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3782 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3783 1.1 christos 3784 1.1 christos aese v0.16b, v26.16b 3785 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3786 1.1 christos 3787 1.1 christos aese v3.16b, v23.16b 3788 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3789 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3790 1.1 christos 3791 1.1 christos aese v1.16b, v22.16b 3792 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3793 1.1 christos 3794 1.1 christos aese v2.16b, v25.16b 3795 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3796 1.1 christos 3797 1.1 christos aese v0.16b, v27.16b 3798 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 3799 1.1 christos 3800 1.1 christos aese v1.16b, v23.16b 3801 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3802 1.1 christos 3803 1.1 christos aese v3.16b, v24.16b 3804 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3805 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3806 1.1 christos 3807 1.1 christos aese v0.16b, v28.16b 3808 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3809 1.1 christos 3810 1.1 christos aese v1.16b, v24.16b 3811 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3812 1.1 christos 3813 1.1 christos aese v3.16b, v25.16b 3814 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3815 1.1 christos 3816 1.1 christos aese v2.16b, v26.16b 3817 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3818 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3819 1.1 christos 3820 1.1 christos aese v1.16b, v25.16b 3821 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3822 1.1 christos 3823 1.1 christos aese v3.16b, v26.16b 3824 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3825 1.1 christos 3826 1.1 christos aese v2.16b, v27.16b 3827 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3828 1.1 christos 3829 1.1 christos aese v1.16b, v26.16b 3830 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3831 1.1 christos 3832 1.1 christos aese v3.16b, v27.16b 3833 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3834 1.1 christos 3835 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3836 1.1 christos 3837 1.1 christos aese v1.16b, v27.16b 3838 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3839 1.1 christos 3840 1.1 christos aese v2.16b, v28.16b 3841 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3842 1.1 christos 3843 1.1 christos aese v3.16b, v28.16b 3844 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3845 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3846 1.1 christos 3847 1.1 christos aese v1.16b, v28.16b 3848 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3849 1.1 christos 3850 1.1 christos aese v0.16b, v29.16b 3851 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3852 1.1 christos 3853 1.1 christos aese v2.16b, v29.16b 3854 1.1 christos 3855 1.1 christos aese v1.16b, v29.16b 3856 1.1 christos 3857 1.1 christos aese v3.16b, v29.16b 3858 1.1 christos 3859 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3860 1.1 christos .L192_dec_tail: //TAIL 3861 1.1 christos 3862 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 3863 1.1 christos ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3864 1.1 christos 3865 1.1 christos eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 3866 1.1 christos 3867 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 3868 1.1 christos 3869 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 3870 1.1 christos 3871 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 3872 1.1 christos 3873 1.1 christos cmp x5, #48 3874 1.1 christos 3875 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 12 high 3876 1.1 christos #ifdef __AARCH64EB__ 3877 1.1 christos rev x7, x7 3878 1.1 christos #endif 3879 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 12 low 3880 1.1 christos #ifdef __AARCH64EB__ 3881 1.1 christos rev x6, x6 3882 1.1 christos #endif 3883 1.1 christos b.gt .L192_dec_blocks_more_than_3 3884 1.1 christos 3885 1.1 christos movi v11.8b, #0 3886 1.1 christos movi v9.8b, #0 3887 1.1 christos 3888 1.1 christos mov v3.16b, v2.16b 3889 1.1 christos mov v2.16b, v1.16b 3890 1.1 christos sub w12, w12, #1 3891 1.1 christos 3892 1.1 christos movi v10.8b, #0 3893 1.1 christos cmp x5, #32 3894 1.1 christos b.gt .L192_dec_blocks_more_than_2 3895 1.1 christos 3896 1.1 christos mov v3.16b, v1.16b 3897 1.1 christos cmp x5, #16 3898 1.1 christos sub w12, w12, #1 3899 1.1 christos 3900 1.1 christos b.gt .L192_dec_blocks_more_than_1 3901 1.1 christos 3902 1.1 christos sub w12, w12, #1 3903 1.1 christos b .L192_dec_blocks_less_than_1 3904 1.1 christos .L192_dec_blocks_more_than_3: //blocks left > 3 3905 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 3906 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 3907 1.1 christos 3908 1.1 christos stp x6, x7, [x2], #16 //AES final-3 block - store result 3909 1.1 christos 3910 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 3911 1.1 christos 3912 1.1 christos eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 3913 1.1 christos 3914 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 3915 1.1 christos mov x6, v0.d[0] //AES final-2 block - mov low 3916 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 3917 1.1 christos 3918 1.1 christos mov x7, v0.d[1] //AES final-2 block - mov high 3919 1.1 christos 3920 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 3921 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 3922 1.1 christos 3923 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 3924 1.1 christos 3925 1.1 christos eor x6, x6, x13 //AES final-2 block - round 12 low 3926 1.1 christos #ifdef __AARCH64EB__ 3927 1.1 christos rev x6, x6 3928 1.1 christos #endif 3929 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 3930 1.1 christos 3931 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 3932 1.1 christos eor x7, x7, x14 //AES final-2 block - round 12 high 3933 1.1 christos #ifdef __AARCH64EB__ 3934 1.1 christos rev x7, x7 3935 1.1 christos #endif 3936 1.1 christos .L192_dec_blocks_more_than_2: //blocks left > 2 3937 1.1 christos 3938 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 3939 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 3940 1.1 christos 3941 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 3942 1.1 christos 3943 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 3944 1.1 christos 3945 1.1 christos eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 3946 1.1 christos 3947 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 3948 1.1 christos 3949 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 3950 1.1 christos 3951 1.1 christos stp x6, x7, [x2], #16 //AES final-2 block - store result 3952 1.1 christos 3953 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 3954 1.1 christos mov x7, v0.d[1] //AES final-1 block - mov high 3955 1.1 christos 3956 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 3957 1.1 christos mov x6, v0.d[0] //AES final-1 block - mov low 3958 1.1 christos 3959 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 3960 1.1 christos 3961 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 3962 1.1 christos 3963 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 3964 1.1 christos eor x7, x7, x14 //AES final-1 block - round 12 high 3965 1.1 christos #ifdef __AARCH64EB__ 3966 1.1 christos rev x7, x7 3967 1.1 christos #endif 3968 1.1 christos eor x6, x6, x13 //AES final-1 block - round 12 low 3969 1.1 christos #ifdef __AARCH64EB__ 3970 1.1 christos rev x6, x6 3971 1.1 christos #endif 3972 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 3973 1.1 christos .L192_dec_blocks_more_than_1: //blocks left > 1 3974 1.1 christos 3975 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 3976 1.1 christos 3977 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 3978 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 3979 1.1 christos 3980 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 3981 1.1 christos 3982 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 3983 1.1 christos 3984 1.1 christos eor v0.16b, v5.16b, v3.16b //AES final block - result 3985 1.1 christos stp x6, x7, [x2], #16 //AES final-1 block - store result 3986 1.1 christos 3987 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 3988 1.1 christos 3989 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 3990 1.1 christos 3991 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 3992 1.1 christos mov x7, v0.d[1] //AES final block - mov high 3993 1.1 christos 3994 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 3995 1.1 christos mov x6, v0.d[0] //AES final block - mov low 3996 1.1 christos 3997 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 3998 1.1 christos 3999 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 4000 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 4001 1.1 christos eor x7, x7, x14 //AES final block - round 12 high 4002 1.1 christos #ifdef __AARCH64EB__ 4003 1.1 christos rev x7, x7 4004 1.1 christos #endif 4005 1.1 christos eor x6, x6, x13 //AES final block - round 12 low 4006 1.1 christos #ifdef __AARCH64EB__ 4007 1.1 christos rev x6, x6 4008 1.1 christos #endif 4009 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 4010 1.1 christos .L192_dec_blocks_less_than_1: //blocks left <= 1 4011 1.1 christos 4012 1.1 christos mvn x13, xzr //rk12_l = 0xffffffffffffffff 4013 1.1 christos ldp x4, x5, [x2] //load existing bytes we need to not overwrite 4014 1.1 christos and x1, x1, #127 //bit_length %= 128 4015 1.1 christos 4016 1.1 christos sub x1, x1, #128 //bit_length -= 128 4017 1.1 christos 4018 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 4019 1.1 christos 4020 1.1 christos and x1, x1, #127 //bit_length %= 128 4021 1.1 christos mvn x14, xzr //rk12_h = 0xffffffffffffffff 4022 1.1 christos 4023 1.1 christos lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 4024 1.1 christos cmp x1, #64 4025 1.1 christos 4026 1.1 christos csel x9, x13, x14, lt 4027 1.1 christos csel x10, x14, xzr, lt 4028 1.1 christos 4029 1.1 christos fmov d0, x9 //ctr0b is mask for last block 4030 1.1 christos and x6, x6, x9 4031 1.1 christos bic x4, x4, x9 //mask out low existing bytes 4032 1.1 christos 4033 1.1 christos orr x6, x6, x4 4034 1.1 christos mov v0.d[1], x10 4035 1.1 christos #ifndef __AARCH64EB__ 4036 1.1 christos rev w9, w12 4037 1.1 christos #else 4038 1.1 christos mov w9, w12 4039 1.1 christos #endif 4040 1.1 christos 4041 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 4042 1.1 christos str w9, [x16, #12] //store the updated counter 4043 1.1 christos 4044 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 4045 1.1 christos 4046 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 4047 1.1 christos bic x5, x5, x10 //mask out high existing bytes 4048 1.1 christos 4049 1.1 christos and x7, x7, x10 4050 1.1 christos 4051 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 4052 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 4053 1.1 christos 4054 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 4055 1.1 christos 4056 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 4057 1.1 christos 4058 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 4059 1.1 christos 4060 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 4061 1.1 christos 4062 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 4063 1.1 christos 4064 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 4065 1.1 christos movi v8.8b, #0xc2 4066 1.1 christos 4067 1.1 christos eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 4068 1.1 christos 4069 1.1 christos shl d8, d8, #56 //mod_constant 4070 1.1 christos 4071 1.1 christos eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 4072 1.1 christos 4073 1.1 christos pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 4074 1.1 christos orr x7, x7, x5 4075 1.1 christos stp x6, x7, [x2] 4076 1.1 christos 4077 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 4078 1.1 christos 4079 1.1 christos eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 4080 1.1 christos 4081 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 4082 1.1 christos 4083 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 4084 1.1 christos 4085 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 4086 1.1 christos 4087 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 4088 1.1 christos 4089 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 4090 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 4091 1.1 christos rev64 v11.16b, v11.16b 4092 1.1 christos mov x0, x15 4093 1.1 christos st1 { v11.16b }, [x3] 4094 1.1 christos 4095 1.1 christos ldp x21, x22, [sp, #16] 4096 1.1 christos ldp x23, x24, [sp, #32] 4097 1.1 christos ldp d8, d9, [sp, #48] 4098 1.1 christos ldp d10, d11, [sp, #64] 4099 1.1 christos ldp d12, d13, [sp, #80] 4100 1.1 christos ldp d14, d15, [sp, #96] 4101 1.1 christos ldp x19, x20, [sp], #112 4102 1.1 christos ret 4103 1.1 christos 4104 1.1 christos .L192_dec_ret: 4105 1.1 christos mov w0, #0x0 4106 1.1 christos ret 4107 1.1 christos .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 4108 1.1 christos .globl aes_gcm_enc_256_kernel 4109 1.1 christos .type aes_gcm_enc_256_kernel,%function 4110 1.1 christos .align 4 4111 1.1 christos aes_gcm_enc_256_kernel: 4112 1.2 christos AARCH64_VALID_CALL_TARGET 4113 1.1 christos cbz x1, .L256_enc_ret 4114 1.1 christos stp x19, x20, [sp, #-112]! 4115 1.1 christos mov x16, x4 4116 1.1 christos mov x8, x5 4117 1.1 christos stp x21, x22, [sp, #16] 4118 1.1 christos stp x23, x24, [sp, #32] 4119 1.1 christos stp d8, d9, [sp, #48] 4120 1.1 christos stp d10, d11, [sp, #64] 4121 1.1 christos stp d12, d13, [sp, #80] 4122 1.1 christos stp d14, d15, [sp, #96] 4123 1.1 christos 4124 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 4125 1.1 christos lsr x5, x1, #3 //byte_len 4126 1.1 christos mov x15, x5 4127 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 4128 1.1 christos #ifdef __AARCH64EB__ 4129 1.1 christos rev x10, x10 4130 1.1 christos rev x11, x11 4131 1.1 christos #endif 4132 1.1 christos ldp x13, x14, [x8, #224] //load rk14 4133 1.1 christos #ifdef __AARCH64EB__ 4134 1.1 christos ror x13, x13, #32 4135 1.1 christos ror x14, x14, #32 4136 1.1 christos #endif 4137 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 4138 1.1 christos sub x5, x5, #1 //byte_len - 1 4139 1.1 christos 4140 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 4141 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4142 1.1 christos 4143 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 4144 1.1 christos add x5, x5, x0 4145 1.1 christos 4146 1.1 christos lsr x12, x11, #32 4147 1.1 christos fmov d2, x10 //CTR block 2 4148 1.1 christos orr w11, w11, w11 4149 1.1 christos 4150 1.1 christos rev w12, w12 //rev_ctr32 4151 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 4152 1.1 christos fmov d1, x10 //CTR block 1 4153 1.1 christos 4154 1.1 christos aese v0.16b, v18.16b 4155 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 4156 1.1 christos add w12, w12, #1 //increment rev_ctr32 4157 1.1 christos 4158 1.1 christos rev w9, w12 //CTR block 1 4159 1.1 christos fmov d3, x10 //CTR block 3 4160 1.1 christos 4161 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 4162 1.1 christos add w12, w12, #1 //CTR block 1 4163 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 4164 1.1 christos 4165 1.1 christos fmov v1.d[1], x9 //CTR block 1 4166 1.1 christos rev w9, w12 //CTR block 2 4167 1.1 christos add w12, w12, #1 //CTR block 2 4168 1.1 christos 4169 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 4170 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 4171 1.1 christos 4172 1.1 christos fmov v2.d[1], x9 //CTR block 2 4173 1.1 christos rev w9, w12 //CTR block 3 4174 1.1 christos 4175 1.1 christos aese v0.16b, v19.16b 4176 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 4177 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 4178 1.1 christos 4179 1.1 christos fmov v3.d[1], x9 //CTR block 3 4180 1.1 christos 4181 1.1 christos aese v1.16b, v18.16b 4182 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 4183 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 4184 1.1 christos 4185 1.1 christos aese v0.16b, v20.16b 4186 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 4187 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 4188 1.1 christos 4189 1.1 christos aese v2.16b, v18.16b 4190 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 4191 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 4192 1.1 christos 4193 1.1 christos aese v1.16b, v19.16b 4194 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 4195 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 4196 1.1 christos #ifndef __AARCH64EB__ 4197 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 4198 1.1 christos #endif 4199 1.1 christos aese v3.16b, v18.16b 4200 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 4201 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 4202 1.1 christos 4203 1.1 christos aese v2.16b, v19.16b 4204 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 4205 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 4206 1.1 christos 4207 1.1 christos aese v1.16b, v20.16b 4208 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 4209 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 4210 1.1 christos #ifndef __AARCH64EB__ 4211 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 4212 1.1 christos #endif 4213 1.1 christos aese v3.16b, v19.16b 4214 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 4215 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 4216 1.1 christos 4217 1.1 christos aese v2.16b, v20.16b 4218 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 4219 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 4220 1.1 christos #ifndef __AARCH64EB__ 4221 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 4222 1.1 christos #endif 4223 1.1 christos aese v1.16b, v21.16b 4224 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 4225 1.1 christos ld1 {v28.4s}, [x8], #16 //load rk10 4226 1.1 christos 4227 1.1 christos aese v3.16b, v20.16b 4228 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 4229 1.1 christos ld1 {v29.4s}, [x8], #16 //load rk11 4230 1.1 christos 4231 1.1 christos aese v2.16b, v21.16b 4232 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 4233 1.1 christos add w12, w12, #1 //CTR block 3 4234 1.1 christos 4235 1.1 christos aese v0.16b, v21.16b 4236 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 4237 1.1 christos 4238 1.1 christos aese v3.16b, v21.16b 4239 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 4240 1.1 christos ld1 { v11.16b}, [x3] 4241 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 4242 1.1 christos rev64 v11.16b, v11.16b 4243 1.1 christos 4244 1.1 christos aese v2.16b, v22.16b 4245 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 4246 1.1 christos 4247 1.1 christos aese v0.16b, v22.16b 4248 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 4249 1.1 christos 4250 1.1 christos aese v1.16b, v22.16b 4251 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 4252 1.1 christos 4253 1.1 christos aese v3.16b, v22.16b 4254 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 4255 1.1 christos 4256 1.1 christos aese v0.16b, v23.16b 4257 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 4258 1.1 christos 4259 1.1 christos aese v1.16b, v23.16b 4260 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 4261 1.1 christos 4262 1.1 christos aese v3.16b, v23.16b 4263 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 4264 1.1 christos 4265 1.1 christos aese v2.16b, v23.16b 4266 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 4267 1.1 christos 4268 1.1 christos aese v1.16b, v24.16b 4269 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 4270 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 4271 1.1 christos 4272 1.1 christos aese v3.16b, v24.16b 4273 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 4274 1.1 christos ld1 {v30.4s}, [x8], #16 //load rk12 4275 1.1 christos 4276 1.1 christos aese v0.16b, v24.16b 4277 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 4278 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 4279 1.1 christos #ifndef __AARCH64EB__ 4280 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 4281 1.1 christos #endif 4282 1.1 christos aese v2.16b, v24.16b 4283 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 4284 1.1 christos ld1 {v31.4s}, [x8], #16 //load rk13 4285 1.1 christos 4286 1.1 christos aese v1.16b, v25.16b 4287 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 4288 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 4289 1.1 christos 4290 1.1 christos aese v0.16b, v25.16b 4291 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 4292 1.1 christos 4293 1.1 christos aese v2.16b, v25.16b 4294 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 4295 1.1 christos 4296 1.1 christos aese v3.16b, v25.16b 4297 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 4298 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 4299 1.1 christos 4300 1.1 christos aese v1.16b, v26.16b 4301 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 4302 1.1 christos 4303 1.1 christos aese v2.16b, v26.16b 4304 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 4305 1.1 christos 4306 1.1 christos aese v3.16b, v26.16b 4307 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 4308 1.1 christos 4309 1.1 christos aese v1.16b, v27.16b 4310 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 9 4311 1.1 christos 4312 1.1 christos aese v2.16b, v27.16b 4313 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 9 4314 1.1 christos 4315 1.1 christos aese v0.16b, v26.16b 4316 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 4317 1.1 christos 4318 1.1 christos aese v1.16b, v28.16b 4319 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 10 4320 1.1 christos 4321 1.1 christos aese v3.16b, v27.16b 4322 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 9 4323 1.1 christos 4324 1.1 christos aese v0.16b, v27.16b 4325 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 9 4326 1.1 christos 4327 1.1 christos aese v2.16b, v28.16b 4328 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 10 4329 1.1 christos 4330 1.1 christos aese v3.16b, v28.16b 4331 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 10 4332 1.1 christos 4333 1.1 christos aese v1.16b, v29.16b 4334 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 11 4335 1.1 christos 4336 1.1 christos aese v2.16b, v29.16b 4337 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 11 4338 1.1 christos 4339 1.1 christos aese v0.16b, v28.16b 4340 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 10 4341 1.1 christos 4342 1.1 christos aese v1.16b, v30.16b 4343 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 12 4344 1.1 christos 4345 1.1 christos aese v2.16b, v30.16b 4346 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 12 4347 1.1 christos 4348 1.1 christos aese v0.16b, v29.16b 4349 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 11 4350 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 4351 1.1 christos 4352 1.1 christos aese v3.16b, v29.16b 4353 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 11 4354 1.1 christos 4355 1.1 christos aese v2.16b, v31.16b //AES block 2 - round 13 4356 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 4357 1.1 christos 4358 1.1 christos aese v0.16b, v30.16b 4359 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 12 4360 1.1 christos 4361 1.1 christos aese v3.16b, v30.16b 4362 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 12 4363 1.1 christos 4364 1.1 christos aese v1.16b, v31.16b //AES block 1 - round 13 4365 1.1 christos 4366 1.1 christos aese v0.16b, v31.16b //AES block 0 - round 13 4367 1.1 christos 4368 1.1 christos aese v3.16b, v31.16b //AES block 3 - round 13 4369 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 4370 1.1 christos b.ge .L256_enc_tail //handle tail 4371 1.1 christos 4372 1.1 christos ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 4373 1.1 christos #ifdef __AARCH64EB__ 4374 1.1 christos rev x19, x19 4375 1.1 christos rev x20, x20 4376 1.1 christos #endif 4377 1.1 christos rev w9, w12 //CTR block 4 4378 1.1 christos ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 4379 1.1 christos #ifdef __AARCH64EB__ 4380 1.1 christos rev x6, x6 4381 1.1 christos rev x7, x7 4382 1.1 christos #endif 4383 1.1 christos ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 4384 1.1 christos #ifdef __AARCH64EB__ 4385 1.1 christos rev x23, x23 4386 1.1 christos rev x24, x24 4387 1.1 christos #endif 4388 1.1 christos ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 4389 1.1 christos #ifdef __AARCH64EB__ 4390 1.1 christos rev x21, x21 4391 1.1 christos rev x22, x22 4392 1.1 christos #endif 4393 1.1 christos add x0, x0, #64 //AES input_ptr update 4394 1.1 christos 4395 1.1 christos eor x19, x19, x13 //AES block 1 - round 14 low 4396 1.1 christos eor x20, x20, x14 //AES block 1 - round 14 high 4397 1.1 christos 4398 1.1 christos fmov d5, x19 //AES block 1 - mov low 4399 1.1 christos eor x6, x6, x13 //AES block 0 - round 14 low 4400 1.1 christos 4401 1.1 christos eor x7, x7, x14 //AES block 0 - round 14 high 4402 1.1 christos eor x24, x24, x14 //AES block 3 - round 14 high 4403 1.1 christos fmov d4, x6 //AES block 0 - mov low 4404 1.1 christos 4405 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 4406 1.1 christos fmov v4.d[1], x7 //AES block 0 - mov high 4407 1.1 christos eor x23, x23, x13 //AES block 3 - round 14 low 4408 1.1 christos 4409 1.1 christos eor x21, x21, x13 //AES block 2 - round 14 low 4410 1.1 christos fmov v5.d[1], x20 //AES block 1 - mov high 4411 1.1 christos 4412 1.1 christos fmov d6, x21 //AES block 2 - mov low 4413 1.1 christos add w12, w12, #1 //CTR block 4 4414 1.1 christos 4415 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 4416 1.1 christos fmov d7, x23 //AES block 3 - mov low 4417 1.1 christos eor x22, x22, x14 //AES block 2 - round 14 high 4418 1.1 christos 4419 1.1 christos fmov v6.d[1], x22 //AES block 2 - mov high 4420 1.1 christos 4421 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 0 - result 4422 1.1 christos fmov d0, x10 //CTR block 4 4423 1.1 christos 4424 1.1 christos fmov v0.d[1], x9 //CTR block 4 4425 1.1 christos rev w9, w12 //CTR block 5 4426 1.1 christos add w12, w12, #1 //CTR block 5 4427 1.1 christos 4428 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 1 - result 4429 1.1 christos fmov d1, x10 //CTR block 5 4430 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 4431 1.1 christos 4432 1.1 christos fmov v1.d[1], x9 //CTR block 5 4433 1.1 christos rev w9, w12 //CTR block 6 4434 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 0 - store result 4435 1.1 christos 4436 1.1 christos fmov v7.d[1], x24 //AES block 3 - mov high 4437 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 4438 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 2 - result 4439 1.1 christos 4440 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 1 - store result 4441 1.1 christos 4442 1.1 christos add w12, w12, #1 //CTR block 6 4443 1.1 christos fmov d2, x10 //CTR block 6 4444 1.1 christos 4445 1.1 christos fmov v2.d[1], x9 //CTR block 6 4446 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 2 - store result 4447 1.1 christos rev w9, w12 //CTR block 7 4448 1.1 christos 4449 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 7 4450 1.1 christos 4451 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 3 - result 4452 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 3 - store result 4453 1.1 christos b.ge .L256_enc_prepretail //do prepretail 4454 1.1 christos 4455 1.1 christos .L256_enc_main_loop: //main loop start 4456 1.1 christos aese v0.16b, v18.16b 4457 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 4458 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 4459 1.1 christos 4460 1.1 christos aese v1.16b, v18.16b 4461 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 4462 1.1 christos fmov d3, x10 //CTR block 4k+3 4463 1.1 christos 4464 1.1 christos aese v2.16b, v18.16b 4465 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 4466 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 4467 1.1 christos 4468 1.1 christos aese v0.16b, v19.16b 4469 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 4470 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 4471 1.1 christos 4472 1.1 christos aese v1.16b, v19.16b 4473 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 4474 1.1 christos ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext 4475 1.1 christos #ifdef __AARCH64EB__ 4476 1.1 christos rev x23, x23 4477 1.1 christos rev x24, x24 4478 1.1 christos #endif 4479 1.1 christos aese v2.16b, v19.16b 4480 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 4481 1.1 christos ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 4482 1.1 christos #ifdef __AARCH64EB__ 4483 1.1 christos rev x21, x21 4484 1.1 christos rev x22, x22 4485 1.1 christos #endif 4486 1.1 christos aese v0.16b, v20.16b 4487 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 4488 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 4489 1.1 christos 4490 1.1 christos aese v1.16b, v20.16b 4491 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 4492 1.1 christos 4493 1.1 christos aese v3.16b, v18.16b 4494 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 4495 1.1 christos eor x23, x23, x13 //AES block 4k+7 - round 14 low 4496 1.1 christos 4497 1.1 christos aese v0.16b, v21.16b 4498 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 4499 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 4500 1.1 christos 4501 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 4502 1.1 christos eor x22, x22, x14 //AES block 4k+6 - round 14 high 4503 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 4504 1.1 christos 4505 1.1 christos aese v3.16b, v19.16b 4506 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 4507 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 4508 1.1 christos 4509 1.1 christos aese v0.16b, v22.16b 4510 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 4511 1.1 christos 4512 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 4513 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 4514 1.1 christos 4515 1.1 christos aese v2.16b, v20.16b 4516 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 4517 1.1 christos 4518 1.1 christos aese v0.16b, v23.16b 4519 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 4520 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 4521 1.1 christos 4522 1.1 christos pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 4523 1.1 christos 4524 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 4525 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 4526 1.1 christos 4527 1.1 christos pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 4528 1.1 christos 4529 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 4530 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 4531 1.1 christos 4532 1.1 christos aese v1.16b, v21.16b 4533 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 4534 1.1 christos 4535 1.1 christos aese v3.16b, v20.16b 4536 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 4537 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 4538 1.1 christos 4539 1.1 christos aese v2.16b, v21.16b 4540 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 4541 1.1 christos 4542 1.1 christos aese v1.16b, v22.16b 4543 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 4544 1.1 christos mov d8, v6.d[1] //GHASH block 4k+2 - mid 4545 1.1 christos 4546 1.1 christos aese v3.16b, v21.16b 4547 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 4548 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 4549 1.1 christos 4550 1.1 christos aese v2.16b, v22.16b 4551 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 4552 1.1 christos 4553 1.1 christos aese v0.16b, v24.16b 4554 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 4555 1.1 christos eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 4556 1.1 christos 4557 1.1 christos aese v3.16b, v22.16b 4558 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 4559 1.1 christos 4560 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 4561 1.1 christos 4562 1.1 christos aese v0.16b, v25.16b 4563 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 4564 1.1 christos 4565 1.1 christos aese v3.16b, v23.16b 4566 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 4567 1.1 christos ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 4568 1.1 christos 4569 1.1 christos aese v1.16b, v23.16b 4570 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 4571 1.1 christos 4572 1.1 christos aese v0.16b, v26.16b 4573 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 4574 1.1 christos 4575 1.1 christos aese v2.16b, v23.16b 4576 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 4577 1.1 christos 4578 1.1 christos aese v1.16b, v24.16b 4579 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 4580 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 4581 1.1 christos 4582 1.1 christos pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 4583 1.1 christos 4584 1.1 christos pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 4585 1.1 christos 4586 1.1 christos aese v1.16b, v25.16b 4587 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 4588 1.1 christos 4589 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 4590 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 4591 1.1 christos 4592 1.1 christos aese v3.16b, v24.16b 4593 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 4594 1.1 christos ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext 4595 1.1 christos #ifdef __AARCH64EB__ 4596 1.1 christos rev x19, x19 4597 1.1 christos rev x20, x20 4598 1.1 christos #endif 4599 1.1 christos aese v1.16b, v26.16b 4600 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 4601 1.1 christos mov d4, v7.d[1] //GHASH block 4k+3 - mid 4602 1.1 christos 4603 1.1 christos aese v2.16b, v24.16b 4604 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 4605 1.1 christos eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 4606 1.1 christos 4607 1.1 christos pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 4608 1.1 christos 4609 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 4610 1.1 christos eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid 4611 1.1 christos 4612 1.1 christos aese v2.16b, v25.16b 4613 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 4614 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 14 low 4615 1.1 christos 4616 1.1 christos aese v1.16b, v27.16b 4617 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 4618 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 4619 1.1 christos 4620 1.1 christos aese v3.16b, v25.16b 4621 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 4622 1.1 christos eor x21, x21, x13 //AES block 4k+6 - round 14 low 4623 1.1 christos 4624 1.1 christos aese v0.16b, v27.16b 4625 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 4626 1.1 christos movi v8.8b, #0xc2 4627 1.1 christos 4628 1.1 christos pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid 4629 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 4630 1.1 christos fmov d5, x19 //AES block 4k+5 - mov low 4631 1.1 christos 4632 1.1 christos aese v2.16b, v26.16b 4633 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 4634 1.1 christos ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 4635 1.1 christos #ifdef __AARCH64EB__ 4636 1.1 christos rev x6, x6 4637 1.1 christos rev x7, x7 4638 1.1 christos #endif 4639 1.1 christos aese v0.16b, v28.16b 4640 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 4641 1.1 christos shl d8, d8, #56 //mod_constant 4642 1.1 christos 4643 1.1 christos aese v3.16b, v26.16b 4644 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 4645 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 4646 1.1 christos 4647 1.1 christos aese v2.16b, v27.16b 4648 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 4649 1.1 christos 4650 1.1 christos aese v1.16b, v28.16b 4651 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 4652 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid 4653 1.1 christos 4654 1.1 christos aese v3.16b, v27.16b 4655 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 4656 1.1 christos add w12, w12, #1 //CTR block 4k+3 4657 1.1 christos 4658 1.1 christos aese v0.16b, v29.16b 4659 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 4660 1.1 christos eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 4661 1.1 christos 4662 1.1 christos aese v1.16b, v29.16b 4663 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 4664 1.1 christos add x0, x0, #64 //AES input_ptr update 4665 1.1 christos 4666 1.1 christos pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 4667 1.1 christos rev w9, w12 //CTR block 4k+8 4668 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 4669 1.1 christos 4670 1.1 christos aese v2.16b, v28.16b 4671 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 4672 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 14 low 4673 1.1 christos 4674 1.1 christos aese v1.16b, v30.16b 4675 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 4676 1.1 christos eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up 4677 1.1 christos 4678 1.1 christos aese v3.16b, v28.16b 4679 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 4680 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 14 high 4681 1.1 christos 4682 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 4683 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 4684 1.1 christos eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid 4685 1.1 christos 4686 1.1 christos aese v0.16b, v30.16b 4687 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 4688 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 14 high 4689 1.1 christos 4690 1.1 christos aese v2.16b, v29.16b 4691 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 4692 1.1 christos eor x24, x24, x14 //AES block 4k+7 - round 14 high 4693 1.1 christos 4694 1.1 christos aese v3.16b, v29.16b 4695 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 4696 1.1 christos add w12, w12, #1 //CTR block 4k+8 4697 1.1 christos 4698 1.1 christos aese v0.16b, v31.16b //AES block 4k+4 - round 13 4699 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 4700 1.1 christos eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 4701 1.1 christos 4702 1.1 christos aese v2.16b, v30.16b 4703 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 4704 1.1 christos fmov d7, x23 //AES block 4k+7 - mov low 4705 1.1 christos 4706 1.1 christos aese v1.16b, v31.16b //AES block 4k+5 - round 13 4707 1.1 christos fmov v5.d[1], x20 //AES block 4k+5 - mov high 4708 1.1 christos 4709 1.1 christos fmov d6, x21 //AES block 4k+6 - mov low 4710 1.1 christos cmp x0, x5 //.LOOP CONTROL 4711 1.1 christos 4712 1.1 christos fmov v6.d[1], x22 //AES block 4k+6 - mov high 4713 1.1 christos 4714 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 4715 1.1 christos eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 4716 1.1 christos fmov d0, x10 //CTR block 4k+8 4717 1.1 christos 4718 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 4719 1.1 christos rev w9, w12 //CTR block 4k+9 4720 1.1 christos add w12, w12, #1 //CTR block 4k+9 4721 1.1 christos 4722 1.1 christos eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 4723 1.1 christos fmov d1, x10 //CTR block 4k+9 4724 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 4725 1.1 christos 4726 1.1 christos aese v3.16b, v30.16b 4727 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 4728 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 4729 1.1 christos 4730 1.1 christos aese v2.16b, v31.16b //AES block 4k+6 - round 13 4731 1.1 christos rev w9, w12 //CTR block 4k+10 4732 1.1 christos st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result 4733 1.1 christos 4734 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 4735 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 4736 1.1 christos fmov v7.d[1], x24 //AES block 4k+7 - mov high 4737 1.1 christos 4738 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 4739 1.1 christos st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 4740 1.1 christos add w12, w12, #1 //CTR block 4k+10 4741 1.1 christos 4742 1.1 christos aese v3.16b, v31.16b //AES block 4k+7 - round 13 4743 1.1 christos eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 4744 1.1 christos fmov d2, x10 //CTR block 4k+10 4745 1.1 christos 4746 1.1 christos st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result 4747 1.1 christos fmov v2.d[1], x9 //CTR block 4k+10 4748 1.1 christos rev w9, w12 //CTR block 4k+11 4749 1.1 christos 4750 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 4751 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+11 4752 1.1 christos 4753 1.1 christos eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result 4754 1.1 christos st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result 4755 1.1 christos b.lt .L256_enc_main_loop 4756 1.1 christos 4757 1.1 christos .L256_enc_prepretail: //PREPRETAIL 4758 1.1 christos aese v1.16b, v18.16b 4759 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 4760 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 4761 1.1 christos 4762 1.1 christos aese v2.16b, v18.16b 4763 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 4764 1.1 christos fmov d3, x10 //CTR block 4k+3 4765 1.1 christos 4766 1.1 christos aese v0.16b, v18.16b 4767 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 4768 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 4769 1.1 christos 4770 1.1 christos fmov v3.d[1], x9 //CTR block 4k+3 4771 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 4772 1.1 christos 4773 1.1 christos aese v2.16b, v19.16b 4774 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 4775 1.1 christos 4776 1.1 christos aese v0.16b, v19.16b 4777 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 4778 1.1 christos 4779 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 4780 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 4781 1.1 christos 4782 1.1 christos aese v2.16b, v20.16b 4783 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 4784 1.1 christos 4785 1.1 christos aese v3.16b, v18.16b 4786 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 4787 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 4788 1.1 christos 4789 1.1 christos aese v1.16b, v19.16b 4790 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 4791 1.1 christos 4792 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 4793 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 4794 1.1 christos 4795 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 4796 1.1 christos 4797 1.1 christos aese v2.16b, v21.16b 4798 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 4799 1.1 christos 4800 1.1 christos aese v1.16b, v20.16b 4801 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 4802 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 4803 1.1 christos 4804 1.1 christos aese v0.16b, v20.16b 4805 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 4806 1.1 christos 4807 1.1 christos aese v3.16b, v19.16b 4808 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 4809 1.1 christos 4810 1.1 christos aese v1.16b, v21.16b 4811 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 4812 1.1 christos 4813 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 4814 1.1 christos 4815 1.1 christos pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 4816 1.1 christos 4817 1.1 christos pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 4818 1.1 christos 4819 1.1 christos aese v3.16b, v20.16b 4820 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 4821 1.1 christos 4822 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 4823 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 4824 1.1 christos 4825 1.1 christos aese v0.16b, v21.16b 4826 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 4827 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 4828 1.1 christos 4829 1.1 christos aese v3.16b, v21.16b 4830 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 4831 1.1 christos 4832 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 4833 1.1 christos mov d8, v6.d[1] //GHASH block 4k+2 - mid 4834 1.1 christos 4835 1.1 christos aese v0.16b, v22.16b 4836 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 4837 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 4838 1.1 christos 4839 1.1 christos aese v3.16b, v22.16b 4840 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 4841 1.1 christos 4842 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 4843 1.1 christos eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 4844 1.1 christos add w12, w12, #1 //CTR block 4k+3 4845 1.1 christos 4846 1.1 christos pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 4847 1.1 christos 4848 1.1 christos aese v3.16b, v23.16b 4849 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 4850 1.1 christos 4851 1.1 christos aese v2.16b, v22.16b 4852 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 4853 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 4854 1.1 christos 4855 1.1 christos pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 4856 1.1 christos 4857 1.1 christos eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 4858 1.1 christos ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 4859 1.1 christos 4860 1.1 christos aese v2.16b, v23.16b 4861 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 4862 1.1 christos 4863 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 4864 1.1 christos mov d4, v7.d[1] //GHASH block 4k+3 - mid 4865 1.1 christos 4866 1.1 christos aese v1.16b, v22.16b 4867 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 4868 1.1 christos 4869 1.1 christos pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 4870 1.1 christos 4871 1.1 christos eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid 4872 1.1 christos 4873 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 4874 1.1 christos 4875 1.1 christos aese v1.16b, v23.16b 4876 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 4877 1.1 christos 4878 1.1 christos pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid 4879 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 4880 1.1 christos 4881 1.1 christos aese v0.16b, v23.16b 4882 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 4883 1.1 christos 4884 1.1 christos aese v1.16b, v24.16b 4885 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 4886 1.1 christos 4887 1.1 christos aese v2.16b, v24.16b 4888 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 4889 1.1 christos 4890 1.1 christos aese v0.16b, v24.16b 4891 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 4892 1.1 christos movi v8.8b, #0xc2 4893 1.1 christos 4894 1.1 christos aese v3.16b, v24.16b 4895 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 4896 1.1 christos 4897 1.1 christos aese v1.16b, v25.16b 4898 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 4899 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 4900 1.1 christos 4901 1.1 christos aese v0.16b, v25.16b 4902 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 4903 1.1 christos 4904 1.1 christos aese v3.16b, v25.16b 4905 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 4906 1.1 christos shl d8, d8, #56 //mod_constant 4907 1.1 christos 4908 1.1 christos aese v1.16b, v26.16b 4909 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 4910 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid 4911 1.1 christos 4912 1.1 christos pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 4913 1.1 christos 4914 1.1 christos aese v3.16b, v26.16b 4915 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 4916 1.1 christos 4917 1.1 christos aese v1.16b, v27.16b 4918 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 4919 1.1 christos 4920 1.1 christos aese v0.16b, v26.16b 4921 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 4922 1.1 christos eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 4923 1.1 christos 4924 1.1 christos aese v3.16b, v27.16b 4925 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 4926 1.1 christos 4927 1.1 christos eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 4928 1.1 christos 4929 1.1 christos pmull v4.1q, v9.1d, v8.1d 4930 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 4931 1.1 christos 4932 1.1 christos aese v3.16b, v28.16b 4933 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 4934 1.1 christos 4935 1.1 christos aese v2.16b, v25.16b 4936 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 4937 1.1 christos eor v10.16b, v10.16b, v11.16b 4938 1.1 christos 4939 1.1 christos aese v1.16b, v28.16b 4940 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 4941 1.1 christos 4942 1.1 christos aese v0.16b, v27.16b 4943 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 4944 1.1 christos 4945 1.1 christos aese v2.16b, v26.16b 4946 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 4947 1.1 christos 4948 1.1 christos aese v1.16b, v29.16b 4949 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 4950 1.1 christos eor v10.16b, v10.16b, v4.16b 4951 1.1 christos 4952 1.1 christos aese v0.16b, v28.16b 4953 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 4954 1.1 christos 4955 1.1 christos aese v2.16b, v27.16b 4956 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 4957 1.1 christos 4958 1.1 christos aese v1.16b, v30.16b 4959 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 4960 1.1 christos 4961 1.1 christos aese v0.16b, v29.16b 4962 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 4963 1.1 christos eor v10.16b, v10.16b, v9.16b 4964 1.1 christos 4965 1.1 christos aese v3.16b, v29.16b 4966 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 4967 1.1 christos 4968 1.1 christos aese v2.16b, v28.16b 4969 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 4970 1.1 christos 4971 1.1 christos aese v0.16b, v30.16b 4972 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 4973 1.1 christos 4974 1.1 christos pmull v4.1q, v10.1d, v8.1d 4975 1.1 christos 4976 1.1 christos aese v2.16b, v29.16b 4977 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 4978 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 4979 1.1 christos 4980 1.1 christos aese v3.16b, v30.16b 4981 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 4982 1.1 christos 4983 1.1 christos aese v1.16b, v31.16b //AES block 4k+5 - round 13 4984 1.1 christos eor v11.16b, v11.16b, v4.16b 4985 1.1 christos 4986 1.1 christos aese v2.16b, v30.16b 4987 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 4988 1.1 christos 4989 1.1 christos aese v3.16b, v31.16b //AES block 4k+7 - round 13 4990 1.1 christos 4991 1.1 christos aese v0.16b, v31.16b //AES block 4k+4 - round 13 4992 1.1 christos 4993 1.1 christos aese v2.16b, v31.16b //AES block 4k+6 - round 13 4994 1.1 christos eor v11.16b, v11.16b, v10.16b 4995 1.1 christos .L256_enc_tail: //TAIL 4996 1.1 christos 4997 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 4998 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 4999 1.1 christos ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 5000 1.1 christos #ifdef __AARCH64EB__ 5001 1.1 christos rev x6, x6 5002 1.1 christos rev x7, x7 5003 1.1 christos #endif 5004 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 14 low 5005 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 14 high 5006 1.1 christos 5007 1.1 christos cmp x5, #48 5008 1.1 christos fmov d4, x6 //AES block 4k+4 - mov low 5009 1.1 christos 5010 1.1 christos fmov v4.d[1], x7 //AES block 4k+4 - mov high 5011 1.1 christos 5012 1.1 christos eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 5013 1.1 christos b.gt .L256_enc_blocks_more_than_3 5014 1.1 christos 5015 1.1 christos cmp x5, #32 5016 1.1 christos mov v3.16b, v2.16b 5017 1.1 christos movi v11.8b, #0 5018 1.1 christos 5019 1.1 christos movi v9.8b, #0 5020 1.1 christos sub w12, w12, #1 5021 1.1 christos 5022 1.1 christos mov v2.16b, v1.16b 5023 1.1 christos movi v10.8b, #0 5024 1.1 christos b.gt .L256_enc_blocks_more_than_2 5025 1.1 christos 5026 1.1 christos mov v3.16b, v1.16b 5027 1.1 christos sub w12, w12, #1 5028 1.1 christos cmp x5, #16 5029 1.1 christos 5030 1.1 christos b.gt .L256_enc_blocks_more_than_1 5031 1.1 christos 5032 1.1 christos sub w12, w12, #1 5033 1.1 christos b .L256_enc_blocks_less_than_1 5034 1.1 christos .L256_enc_blocks_more_than_3: //blocks left > 3 5035 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 5036 1.1 christos 5037 1.1 christos ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 5038 1.1 christos #ifdef __AARCH64EB__ 5039 1.1 christos rev x6, x6 5040 1.1 christos rev x7, x7 5041 1.1 christos #endif 5042 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 5043 1.1 christos 5044 1.1 christos eor x6, x6, x13 //AES final-2 block - round 14 low 5045 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 5046 1.1 christos 5047 1.1 christos eor x7, x7, x14 //AES final-2 block - round 14 high 5048 1.1 christos 5049 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 5050 1.1 christos fmov d5, x6 //AES final-2 block - mov low 5051 1.1 christos 5052 1.1 christos fmov v5.d[1], x7 //AES final-2 block - mov high 5053 1.1 christos 5054 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 5055 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 5056 1.1 christos 5057 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 5058 1.1 christos 5059 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 5060 1.1 christos 5061 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 5062 1.1 christos 5063 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 5064 1.1 christos eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 5065 1.1 christos .L256_enc_blocks_more_than_2: //blocks left > 2 5066 1.1 christos 5067 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 5068 1.1 christos 5069 1.1 christos ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high 5070 1.1 christos #ifdef __AARCH64EB__ 5071 1.1 christos rev x6, x6 5072 1.1 christos rev x7, x7 5073 1.1 christos #endif 5074 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 5075 1.1 christos 5076 1.1 christos eor x6, x6, x13 //AES final-1 block - round 14 low 5077 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 5078 1.1 christos 5079 1.1 christos fmov d5, x6 //AES final-1 block - mov low 5080 1.1 christos eor x7, x7, x14 //AES final-1 block - round 14 high 5081 1.1 christos 5082 1.1 christos fmov v5.d[1], x7 //AES final-1 block - mov high 5083 1.1 christos 5084 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 5085 1.1 christos 5086 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 5087 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 5088 1.1 christos 5089 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 5090 1.1 christos 5091 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 5092 1.1 christos 5093 1.1 christos eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 5094 1.1 christos 5095 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 5096 1.1 christos 5097 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 5098 1.1 christos 5099 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 5100 1.1 christos 5101 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 5102 1.1 christos .L256_enc_blocks_more_than_1: //blocks left > 1 5103 1.1 christos 5104 1.1 christos st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 5105 1.1 christos 5106 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 5107 1.1 christos 5108 1.1 christos ldp x6, x7, [x0], #16 //AES final block - load input low & high 5109 1.1 christos #ifdef __AARCH64EB__ 5110 1.1 christos rev x6, x6 5111 1.1 christos rev x7, x7 5112 1.1 christos #endif 5113 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 5114 1.1 christos 5115 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 5116 1.1 christos 5117 1.1 christos eor x6, x6, x13 //AES final block - round 14 low 5118 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 5119 1.1 christos 5120 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 5121 1.1 christos eor x7, x7, x14 //AES final block - round 14 high 5122 1.1 christos 5123 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 5124 1.1 christos 5125 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 5126 1.1 christos 5127 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 5128 1.1 christos fmov d5, x6 //AES final block - mov low 5129 1.1 christos 5130 1.1 christos fmov v5.d[1], x7 //AES final block - mov high 5131 1.1 christos 5132 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 5133 1.1 christos 5134 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 5135 1.1 christos 5136 1.1 christos eor v5.16b, v5.16b, v3.16b //AES final block - result 5137 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 5138 1.1 christos 5139 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 5140 1.1 christos .L256_enc_blocks_less_than_1: //blocks left <= 1 5141 1.1 christos 5142 1.1 christos and x1, x1, #127 //bit_length %= 128 5143 1.1 christos 5144 1.1 christos mvn x13, xzr //rk14_l = 0xffffffffffffffff 5145 1.1 christos sub x1, x1, #128 //bit_length -= 128 5146 1.1 christos 5147 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 5148 1.1 christos ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 5149 1.1 christos 5150 1.1 christos mvn x14, xzr //rk14_h = 0xffffffffffffffff 5151 1.1 christos and x1, x1, #127 //bit_length %= 128 5152 1.1 christos 5153 1.1 christos lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 5154 1.1 christos cmp x1, #64 5155 1.1 christos 5156 1.1 christos csel x6, x13, x14, lt 5157 1.1 christos csel x7, x14, xzr, lt 5158 1.1 christos 5159 1.1 christos fmov d0, x6 //ctr0b is mask for last block 5160 1.1 christos 5161 1.1 christos fmov v0.d[1], x7 5162 1.1 christos 5163 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 5164 1.1 christos 5165 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 5166 1.1 christos 5167 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 5168 1.1 christos 5169 1.1 christos bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 5170 1.1 christos 5171 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 5172 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 5173 1.1 christos #ifndef __AARCH64EB__ 5174 1.1 christos rev w9, w12 5175 1.1 christos #else 5176 1.1 christos mov w9, w12 5177 1.1 christos #endif 5178 1.1 christos 5179 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 5180 1.1 christos 5181 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 5182 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 5183 1.1 christos 5184 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 5185 1.1 christos 5186 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 5187 1.1 christos 5188 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 5189 1.1 christos movi v8.8b, #0xc2 5190 1.1 christos 5191 1.1 christos eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 5192 1.1 christos 5193 1.1 christos shl d8, d8, #56 //mod_constant 5194 1.1 christos 5195 1.1 christos eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up 5196 1.1 christos 5197 1.1 christos pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5198 1.1 christos 5199 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5200 1.1 christos 5201 1.1 christos eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5202 1.1 christos 5203 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5204 1.1 christos 5205 1.1 christos pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5206 1.1 christos 5207 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5208 1.1 christos 5209 1.1 christos str w9, [x16, #12] //store the updated counter 5210 1.1 christos 5211 1.1 christos st1 { v5.16b}, [x2] //store all 16B 5212 1.1 christos eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 5213 1.1 christos 5214 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5215 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 5216 1.1 christos rev64 v11.16b, v11.16b 5217 1.1 christos mov x0, x15 5218 1.1 christos st1 { v11.16b }, [x3] 5219 1.1 christos 5220 1.1 christos ldp x21, x22, [sp, #16] 5221 1.1 christos ldp x23, x24, [sp, #32] 5222 1.1 christos ldp d8, d9, [sp, #48] 5223 1.1 christos ldp d10, d11, [sp, #64] 5224 1.1 christos ldp d12, d13, [sp, #80] 5225 1.1 christos ldp d14, d15, [sp, #96] 5226 1.1 christos ldp x19, x20, [sp], #112 5227 1.1 christos ret 5228 1.1 christos 5229 1.1 christos .L256_enc_ret: 5230 1.1 christos mov w0, #0x0 5231 1.1 christos ret 5232 1.1 christos .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5233 1.1 christos .globl aes_gcm_dec_256_kernel 5234 1.1 christos .type aes_gcm_dec_256_kernel,%function 5235 1.1 christos .align 4 5236 1.1 christos aes_gcm_dec_256_kernel: 5237 1.2 christos AARCH64_VALID_CALL_TARGET 5238 1.1 christos cbz x1, .L256_dec_ret 5239 1.1 christos stp x19, x20, [sp, #-112]! 5240 1.1 christos mov x16, x4 5241 1.1 christos mov x8, x5 5242 1.1 christos stp x21, x22, [sp, #16] 5243 1.1 christos stp x23, x24, [sp, #32] 5244 1.1 christos stp d8, d9, [sp, #48] 5245 1.1 christos stp d10, d11, [sp, #64] 5246 1.1 christos stp d12, d13, [sp, #80] 5247 1.1 christos stp d14, d15, [sp, #96] 5248 1.1 christos 5249 1.1 christos lsr x5, x1, #3 //byte_len 5250 1.1 christos mov x15, x5 5251 1.1 christos ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 5252 1.1 christos #ifdef __AARCH64EB__ 5253 1.1 christos rev x10, x10 5254 1.1 christos rev x11, x11 5255 1.1 christos #endif 5256 1.1 christos ldp x13, x14, [x8, #224] //load rk14 5257 1.1 christos #ifdef __AARCH64EB__ 5258 1.1 christos ror x14, x14, #32 5259 1.1 christos ror x13, x13, #32 5260 1.1 christos #endif 5261 1.1 christos ld1 {v18.4s}, [x8], #16 //load rk0 5262 1.1 christos sub x5, x5, #1 //byte_len - 1 5263 1.1 christos 5264 1.1 christos ld1 {v19.4s}, [x8], #16 //load rk1 5265 1.1 christos and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5266 1.1 christos 5267 1.1 christos add x4, x0, x1, lsr #3 //end_input_ptr 5268 1.1 christos ld1 {v20.4s}, [x8], #16 //load rk2 5269 1.1 christos 5270 1.1 christos lsr x12, x11, #32 5271 1.1 christos ld1 {v21.4s}, [x8], #16 //load rk3 5272 1.1 christos orr w11, w11, w11 5273 1.1 christos 5274 1.1 christos ld1 {v22.4s}, [x8], #16 //load rk4 5275 1.1 christos add x5, x5, x0 5276 1.1 christos rev w12, w12 //rev_ctr32 5277 1.1 christos 5278 1.1 christos add w12, w12, #1 //increment rev_ctr32 5279 1.1 christos fmov d3, x10 //CTR block 3 5280 1.1 christos 5281 1.1 christos rev w9, w12 //CTR block 1 5282 1.1 christos add w12, w12, #1 //CTR block 1 5283 1.1 christos fmov d1, x10 //CTR block 1 5284 1.1 christos 5285 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 1 5286 1.1 christos ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 5287 1.1 christos 5288 1.1 christos fmov v1.d[1], x9 //CTR block 1 5289 1.1 christos rev w9, w12 //CTR block 2 5290 1.1 christos add w12, w12, #1 //CTR block 2 5291 1.1 christos 5292 1.1 christos fmov d2, x10 //CTR block 2 5293 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 2 5294 1.1 christos 5295 1.1 christos fmov v2.d[1], x9 //CTR block 2 5296 1.1 christos rev w9, w12 //CTR block 3 5297 1.1 christos 5298 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 3 5299 1.1 christos ld1 {v23.4s}, [x8], #16 //load rk5 5300 1.1 christos 5301 1.1 christos fmov v3.d[1], x9 //CTR block 3 5302 1.1 christos add w12, w12, #1 //CTR block 3 5303 1.1 christos 5304 1.1 christos ld1 {v24.4s}, [x8], #16 //load rk6 5305 1.1 christos 5306 1.1 christos ld1 {v25.4s}, [x8], #16 //load rk7 5307 1.1 christos 5308 1.1 christos ld1 {v26.4s}, [x8], #16 //load rk8 5309 1.1 christos 5310 1.1 christos aese v0.16b, v18.16b 5311 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 0 5312 1.1 christos ldr q14, [x3, #80] //load h3l | h3h 5313 1.1 christos #ifndef __AARCH64EB__ 5314 1.1 christos ext v14.16b, v14.16b, v14.16b, #8 5315 1.1 christos #endif 5316 1.1 christos 5317 1.1 christos aese v3.16b, v18.16b 5318 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 0 5319 1.1 christos ldr q15, [x3, #112] //load h4l | h4h 5320 1.1 christos #ifndef __AARCH64EB__ 5321 1.1 christos ext v15.16b, v15.16b, v15.16b, #8 5322 1.1 christos #endif 5323 1.1 christos 5324 1.1 christos aese v1.16b, v18.16b 5325 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 0 5326 1.1 christos ldr q13, [x3, #64] //load h2l | h2h 5327 1.1 christos #ifndef __AARCH64EB__ 5328 1.1 christos ext v13.16b, v13.16b, v13.16b, #8 5329 1.1 christos #endif 5330 1.1 christos 5331 1.1 christos aese v2.16b, v18.16b 5332 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 0 5333 1.1 christos ld1 {v27.4s}, [x8], #16 //load rk9 5334 1.1 christos 5335 1.1 christos aese v0.16b, v19.16b 5336 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 1 5337 1.1 christos 5338 1.1 christos aese v1.16b, v19.16b 5339 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 1 5340 1.1 christos ld1 { v11.16b}, [x3] 5341 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 5342 1.1 christos rev64 v11.16b, v11.16b 5343 1.1 christos 5344 1.1 christos aese v2.16b, v19.16b 5345 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 1 5346 1.1 christos ld1 {v28.4s}, [x8], #16 //load rk10 5347 1.1 christos 5348 1.1 christos aese v3.16b, v19.16b 5349 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 1 5350 1.1 christos ld1 {v29.4s}, [x8], #16 //load rk11 5351 1.1 christos 5352 1.1 christos aese v0.16b, v20.16b 5353 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 2 5354 1.1 christos ldr q12, [x3, #32] //load h1l | h1h 5355 1.1 christos #ifndef __AARCH64EB__ 5356 1.1 christos ext v12.16b, v12.16b, v12.16b, #8 5357 1.1 christos #endif 5358 1.1 christos aese v2.16b, v20.16b 5359 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 2 5360 1.1 christos ld1 {v30.4s}, [x8], #16 //load rk12 5361 1.1 christos 5362 1.1 christos aese v3.16b, v20.16b 5363 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 2 5364 1.1 christos 5365 1.1 christos aese v0.16b, v21.16b 5366 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 3 5367 1.1 christos 5368 1.1 christos aese v1.16b, v20.16b 5369 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 2 5370 1.1 christos 5371 1.1 christos aese v3.16b, v21.16b 5372 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 3 5373 1.1 christos 5374 1.1 christos aese v0.16b, v22.16b 5375 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 4 5376 1.1 christos cmp x0, x5 //check if we have <= 4 blocks 5377 1.1 christos 5378 1.1 christos aese v2.16b, v21.16b 5379 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 3 5380 1.1 christos 5381 1.1 christos aese v1.16b, v21.16b 5382 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 3 5383 1.1 christos 5384 1.1 christos aese v3.16b, v22.16b 5385 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 4 5386 1.1 christos 5387 1.1 christos aese v2.16b, v22.16b 5388 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 4 5389 1.1 christos 5390 1.1 christos aese v1.16b, v22.16b 5391 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 4 5392 1.1 christos 5393 1.1 christos aese v3.16b, v23.16b 5394 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 5 5395 1.1 christos 5396 1.1 christos aese v0.16b, v23.16b 5397 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 5 5398 1.1 christos 5399 1.1 christos aese v1.16b, v23.16b 5400 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 5 5401 1.1 christos 5402 1.1 christos aese v2.16b, v23.16b 5403 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 5 5404 1.1 christos 5405 1.1 christos aese v0.16b, v24.16b 5406 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 6 5407 1.1 christos 5408 1.1 christos aese v3.16b, v24.16b 5409 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 6 5410 1.1 christos 5411 1.1 christos aese v1.16b, v24.16b 5412 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 6 5413 1.1 christos 5414 1.1 christos aese v2.16b, v24.16b 5415 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 6 5416 1.1 christos 5417 1.1 christos aese v0.16b, v25.16b 5418 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 7 5419 1.1 christos 5420 1.1 christos aese v1.16b, v25.16b 5421 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 7 5422 1.1 christos 5423 1.1 christos aese v3.16b, v25.16b 5424 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 7 5425 1.1 christos 5426 1.1 christos aese v0.16b, v26.16b 5427 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 8 5428 1.1 christos 5429 1.1 christos aese v2.16b, v25.16b 5430 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 7 5431 1.1 christos 5432 1.1 christos aese v3.16b, v26.16b 5433 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 8 5434 1.1 christos 5435 1.1 christos aese v1.16b, v26.16b 5436 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 8 5437 1.1 christos 5438 1.1 christos aese v0.16b, v27.16b 5439 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 9 5440 1.1 christos 5441 1.1 christos aese v2.16b, v26.16b 5442 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 8 5443 1.1 christos ld1 {v31.4s}, [x8], #16 //load rk13 5444 1.1 christos 5445 1.1 christos aese v1.16b, v27.16b 5446 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 9 5447 1.1 christos 5448 1.1 christos aese v0.16b, v28.16b 5449 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 10 5450 1.1 christos 5451 1.1 christos aese v3.16b, v27.16b 5452 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 9 5453 1.1 christos 5454 1.1 christos aese v1.16b, v28.16b 5455 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 10 5456 1.1 christos 5457 1.1 christos aese v2.16b, v27.16b 5458 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 9 5459 1.1 christos 5460 1.1 christos aese v3.16b, v28.16b 5461 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 10 5462 1.1 christos 5463 1.1 christos aese v0.16b, v29.16b 5464 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 11 5465 1.1 christos 5466 1.1 christos aese v2.16b, v28.16b 5467 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 10 5468 1.1 christos 5469 1.1 christos aese v3.16b, v29.16b 5470 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 11 5471 1.1 christos 5472 1.1 christos aese v1.16b, v29.16b 5473 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 11 5474 1.1 christos 5475 1.1 christos aese v2.16b, v29.16b 5476 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 11 5477 1.1 christos 5478 1.1 christos trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 5479 1.1 christos 5480 1.1 christos trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 5481 1.1 christos 5482 1.1 christos trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 5483 1.1 christos trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 5484 1.1 christos 5485 1.1 christos aese v1.16b, v30.16b 5486 1.1 christos aesmc v1.16b, v1.16b //AES block 1 - round 12 5487 1.1 christos 5488 1.1 christos aese v0.16b, v30.16b 5489 1.1 christos aesmc v0.16b, v0.16b //AES block 0 - round 12 5490 1.1 christos 5491 1.1 christos aese v2.16b, v30.16b 5492 1.1 christos aesmc v2.16b, v2.16b //AES block 2 - round 12 5493 1.1 christos 5494 1.1 christos aese v3.16b, v30.16b 5495 1.1 christos aesmc v3.16b, v3.16b //AES block 3 - round 12 5496 1.1 christos eor v17.16b, v17.16b, v9.16b //h4k | h3k 5497 1.1 christos 5498 1.1 christos aese v1.16b, v31.16b //AES block 1 - round 13 5499 1.1 christos 5500 1.1 christos aese v2.16b, v31.16b //AES block 2 - round 13 5501 1.1 christos eor v16.16b, v16.16b, v8.16b //h2k | h1k 5502 1.1 christos 5503 1.1 christos aese v3.16b, v31.16b //AES block 3 - round 13 5504 1.1 christos 5505 1.1 christos aese v0.16b, v31.16b //AES block 0 - round 13 5506 1.1 christos b.ge .L256_dec_tail //handle tail 5507 1.1 christos 5508 1.1 christos ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 5509 1.1 christos 5510 1.1 christos rev w9, w12 //CTR block 4 5511 1.1 christos 5512 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 0 - result 5513 1.1 christos 5514 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 1 - result 5515 1.1 christos rev64 v5.16b, v5.16b //GHASH block 1 5516 1.1 christos ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 5517 1.1 christos 5518 1.1 christos mov x7, v0.d[1] //AES block 0 - mov high 5519 1.1 christos 5520 1.1 christos mov x6, v0.d[0] //AES block 0 - mov low 5521 1.1 christos rev64 v4.16b, v4.16b //GHASH block 0 5522 1.1 christos add w12, w12, #1 //CTR block 4 5523 1.1 christos 5524 1.1 christos fmov d0, x10 //CTR block 4 5525 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4 5526 1.1 christos 5527 1.1 christos fmov v0.d[1], x9 //CTR block 4 5528 1.1 christos rev w9, w12 //CTR block 5 5529 1.1 christos add w12, w12, #1 //CTR block 5 5530 1.1 christos 5531 1.1 christos mov x19, v1.d[0] //AES block 1 - mov low 5532 1.1 christos 5533 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 5 5534 1.1 christos mov x20, v1.d[1] //AES block 1 - mov high 5535 1.1 christos eor x7, x7, x14 //AES block 0 - round 14 high 5536 1.1 christos #ifdef __AARCH64EB__ 5537 1.1 christos rev x7, x7 5538 1.1 christos #endif 5539 1.1 christos eor x6, x6, x13 //AES block 0 - round 14 low 5540 1.1 christos #ifdef __AARCH64EB__ 5541 1.1 christos rev x6, x6 5542 1.1 christos #endif 5543 1.1 christos stp x6, x7, [x2], #16 //AES block 0 - store result 5544 1.1 christos fmov d1, x10 //CTR block 5 5545 1.1 christos 5546 1.1 christos ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 5547 1.1 christos 5548 1.1 christos fmov v1.d[1], x9 //CTR block 5 5549 1.1 christos rev w9, w12 //CTR block 6 5550 1.1 christos add w12, w12, #1 //CTR block 6 5551 1.1 christos 5552 1.1 christos eor x19, x19, x13 //AES block 1 - round 14 low 5553 1.1 christos #ifdef __AARCH64EB__ 5554 1.1 christos rev x19, x19 5555 1.1 christos #endif 5556 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 6 5557 1.1 christos 5558 1.1 christos eor x20, x20, x14 //AES block 1 - round 14 high 5559 1.1 christos #ifdef __AARCH64EB__ 5560 1.1 christos rev x20, x20 5561 1.1 christos #endif 5562 1.1 christos stp x19, x20, [x2], #16 //AES block 1 - store result 5563 1.1 christos 5564 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 2 - result 5565 1.1 christos cmp x0, x5 //check if we have <= 8 blocks 5566 1.1 christos b.ge .L256_dec_prepretail //do prepretail 5567 1.1 christos 5568 1.1 christos .L256_dec_main_loop: //main loop start 5569 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 5570 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5571 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5572 1.1 christos 5573 1.1 christos aese v0.16b, v18.16b 5574 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5575 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 5576 1.1 christos 5577 1.1 christos aese v1.16b, v18.16b 5578 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5579 1.1 christos fmov d2, x10 //CTR block 4k+6 5580 1.1 christos 5581 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 5582 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 5583 1.1 christos rev w9, w12 //CTR block 4k+7 5584 1.1 christos 5585 1.1 christos aese v0.16b, v19.16b 5586 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5587 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 5588 1.1 christos 5589 1.1 christos aese v1.16b, v19.16b 5590 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5591 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 5592 1.1 christos 5593 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5594 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 5595 1.1 christos fmov d3, x10 //CTR block 4k+7 5596 1.1 christos 5597 1.1 christos aese v0.16b, v20.16b 5598 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5599 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 5600 1.1 christos 5601 1.1 christos aese v2.16b, v18.16b 5602 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5603 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 5604 1.1 christos 5605 1.1 christos aese v1.16b, v20.16b 5606 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5607 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5608 1.1 christos 5609 1.1 christos aese v0.16b, v21.16b 5610 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5611 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 14 high 5612 1.1 christos #ifdef __AARCH64EB__ 5613 1.1 christos rev x22, x22 5614 1.1 christos #endif 5615 1.1 christos aese v2.16b, v19.16b 5616 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5617 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 5618 1.1 christos 5619 1.1 christos aese v1.16b, v21.16b 5620 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5621 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 5622 1.1 christos 5623 1.1 christos aese v3.16b, v18.16b 5624 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5625 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 14 low 5626 1.1 christos #ifdef __AARCH64EB__ 5627 1.1 christos rev x21, x21 5628 1.1 christos #endif 5629 1.1 christos aese v2.16b, v20.16b 5630 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5631 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 5632 1.1 christos 5633 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5634 1.1 christos 5635 1.1 christos pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5636 1.1 christos 5637 1.1 christos aese v2.16b, v21.16b 5638 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5639 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 5640 1.1 christos 5641 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5642 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 14 low 5643 1.1 christos #ifdef __AARCH64EB__ 5644 1.1 christos rev x23, x23 5645 1.1 christos #endif 5646 1.1 christos pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5647 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 14 high 5648 1.1 christos #ifdef __AARCH64EB__ 5649 1.1 christos rev x24, x24 5650 1.1 christos #endif 5651 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5652 1.1 christos 5653 1.1 christos aese v2.16b, v22.16b 5654 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5655 1.1 christos 5656 1.1 christos aese v3.16b, v19.16b 5657 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5658 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 5659 1.1 christos 5660 1.1 christos aese v0.16b, v22.16b 5661 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5662 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5663 1.1 christos 5664 1.1 christos aese v2.16b, v23.16b 5665 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 5666 1.1 christos add w12, w12, #1 //CTR block 4k+7 5667 1.1 christos 5668 1.1 christos aese v3.16b, v20.16b 5669 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5670 1.1 christos mov d8, v6.d[1] //GHASH block 4k+2 - mid 5671 1.1 christos 5672 1.1 christos aese v1.16b, v22.16b 5673 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5674 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5675 1.1 christos 5676 1.1 christos pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5677 1.1 christos 5678 1.1 christos aese v3.16b, v21.16b 5679 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5680 1.1 christos eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5681 1.1 christos 5682 1.1 christos aese v1.16b, v23.16b 5683 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 5684 1.1 christos 5685 1.1 christos aese v0.16b, v23.16b 5686 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5687 1.1 christos eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5688 1.1 christos 5689 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5690 1.1 christos rev w9, w12 //CTR block 4k+8 5691 1.1 christos 5692 1.1 christos aese v1.16b, v24.16b 5693 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 5694 1.1 christos ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5695 1.1 christos 5696 1.1 christos aese v0.16b, v24.16b 5697 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 5698 1.1 christos add w12, w12, #1 //CTR block 4k+8 5699 1.1 christos 5700 1.1 christos aese v3.16b, v22.16b 5701 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5702 1.1 christos 5703 1.1 christos aese v1.16b, v25.16b 5704 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 5705 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5706 1.1 christos 5707 1.1 christos aese v0.16b, v25.16b 5708 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 5709 1.1 christos 5710 1.1 christos pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5711 1.1 christos mov d6, v7.d[1] //GHASH block 4k+3 - mid 5712 1.1 christos 5713 1.1 christos aese v3.16b, v23.16b 5714 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5715 1.1 christos 5716 1.1 christos pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5717 1.1 christos 5718 1.1 christos aese v0.16b, v26.16b 5719 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 5720 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5721 1.1 christos 5722 1.1 christos aese v3.16b, v24.16b 5723 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 5724 1.1 christos 5725 1.1 christos pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5726 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+8 5727 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 5728 1.1 christos 5729 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5730 1.1 christos 5731 1.1 christos aese v0.16b, v27.16b 5732 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 5733 1.1 christos eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 5734 1.1 christos 5735 1.1 christos aese v1.16b, v26.16b 5736 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 5737 1.1 christos 5738 1.1 christos aese v2.16b, v24.16b 5739 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 5740 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 5741 1.1 christos 5742 1.1 christos aese v0.16b, v28.16b 5743 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 5744 1.1 christos 5745 1.1 christos pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 5746 1.1 christos movi v8.8b, #0xc2 5747 1.1 christos 5748 1.1 christos aese v2.16b, v25.16b 5749 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 5750 1.1 christos eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 5751 1.1 christos 5752 1.1 christos aese v0.16b, v29.16b 5753 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 5754 1.1 christos 5755 1.1 christos aese v3.16b, v25.16b 5756 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 5757 1.1 christos shl d8, d8, #56 //mod_constant 5758 1.1 christos 5759 1.1 christos aese v2.16b, v26.16b 5760 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 5761 1.1 christos eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 5762 1.1 christos 5763 1.1 christos aese v0.16b, v30.16b 5764 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 5765 1.1 christos 5766 1.1 christos pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5767 1.1 christos eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 5768 1.1 christos 5769 1.1 christos aese v1.16b, v27.16b 5770 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 5771 1.1 christos ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 5772 1.1 christos 5773 1.1 christos aese v0.16b, v31.16b //AES block 4k+4 - round 13 5774 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5775 1.1 christos 5776 1.1 christos aese v1.16b, v28.16b 5777 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 5778 1.1 christos eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 5779 1.1 christos 5780 1.1 christos aese v2.16b, v27.16b 5781 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 5782 1.1 christos ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 5783 1.1 christos 5784 1.1 christos aese v3.16b, v26.16b 5785 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 5786 1.1 christos eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 5787 1.1 christos 5788 1.1 christos aese v1.16b, v29.16b 5789 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 5790 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 5791 1.1 christos 5792 1.1 christos aese v2.16b, v28.16b 5793 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 5794 1.1 christos eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5795 1.1 christos 5796 1.1 christos aese v3.16b, v27.16b 5797 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 5798 1.1 christos ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 5799 1.1 christos 5800 1.1 christos aese v1.16b, v30.16b 5801 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 5802 1.1 christos ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 5803 1.1 christos 5804 1.1 christos aese v2.16b, v29.16b 5805 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 5806 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 5807 1.1 christos 5808 1.1 christos aese v3.16b, v28.16b 5809 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 5810 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5811 1.1 christos 5812 1.1 christos aese v1.16b, v31.16b //AES block 4k+5 - round 13 5813 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 5814 1.1 christos 5815 1.1 christos aese v2.16b, v30.16b 5816 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 5817 1.1 christos fmov d0, x10 //CTR block 4k+8 5818 1.1 christos 5819 1.1 christos aese v3.16b, v29.16b 5820 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 5821 1.1 christos fmov v0.d[1], x9 //CTR block 4k+8 5822 1.1 christos 5823 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5824 1.1 christos eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 5825 1.1 christos rev w9, w12 //CTR block 4k+9 5826 1.1 christos 5827 1.1 christos aese v2.16b, v31.16b //AES block 4k+6 - round 13 5828 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+9 5829 1.1 christos cmp x0, x5 //.LOOP CONTROL 5830 1.1 christos 5831 1.1 christos add w12, w12, #1 //CTR block 4k+9 5832 1.1 christos 5833 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 14 low 5834 1.1 christos #ifdef __AARCH64EB__ 5835 1.1 christos rev x6, x6 5836 1.1 christos #endif 5837 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 14 high 5838 1.1 christos #ifdef __AARCH64EB__ 5839 1.1 christos rev x7, x7 5840 1.1 christos #endif 5841 1.1 christos mov x20, v1.d[1] //AES block 4k+5 - mov high 5842 1.1 christos eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 5843 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 5844 1.1 christos 5845 1.1 christos aese v3.16b, v30.16b 5846 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 5847 1.1 christos mov x19, v1.d[0] //AES block 4k+5 - mov low 5848 1.1 christos 5849 1.1 christos fmov d1, x10 //CTR block 4k+9 5850 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5851 1.1 christos 5852 1.1 christos fmov v1.d[1], x9 //CTR block 4k+9 5853 1.1 christos rev w9, w12 //CTR block 4k+10 5854 1.1 christos add w12, w12, #1 //CTR block 4k+10 5855 1.1 christos 5856 1.1 christos aese v3.16b, v31.16b //AES block 4k+7 - round 13 5857 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+10 5858 1.1 christos 5859 1.1 christos rev64 v5.16b, v5.16b //GHASH block 4k+5 5860 1.1 christos eor x20, x20, x14 //AES block 4k+5 - round 14 high 5861 1.1 christos #ifdef __AARCH64EB__ 5862 1.1 christos rev x20, x20 5863 1.1 christos #endif 5864 1.1 christos stp x6, x7, [x2], #16 //AES block 4k+4 - store result 5865 1.1 christos 5866 1.1 christos eor x19, x19, x13 //AES block 4k+5 - round 14 low 5867 1.1 christos #ifdef __AARCH64EB__ 5868 1.1 christos rev x19, x19 5869 1.1 christos #endif 5870 1.1 christos stp x19, x20, [x2], #16 //AES block 4k+5 - store result 5871 1.1 christos 5872 1.1 christos rev64 v4.16b, v4.16b //GHASH block 4k+4 5873 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5874 1.1 christos b.lt .L256_dec_main_loop 5875 1.1 christos 5876 1.1 christos 5877 1.1 christos .L256_dec_prepretail: //PREPRETAIL 5878 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5879 1.1 christos mov x21, v2.d[0] //AES block 4k+2 - mov low 5880 1.1 christos eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5881 1.1 christos 5882 1.1 christos aese v0.16b, v18.16b 5883 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5884 1.1 christos mov x22, v2.d[1] //AES block 4k+2 - mov high 5885 1.1 christos 5886 1.1 christos aese v1.16b, v18.16b 5887 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5888 1.1 christos fmov d2, x10 //CTR block 4k+6 5889 1.1 christos 5890 1.1 christos fmov v2.d[1], x9 //CTR block 4k+6 5891 1.1 christos rev w9, w12 //CTR block 4k+7 5892 1.1 christos eor v4.16b, v4.16b, v11.16b //PRE 1 5893 1.1 christos 5894 1.1 christos rev64 v6.16b, v6.16b //GHASH block 4k+2 5895 1.1 christos orr x9, x11, x9, lsl #32 //CTR block 4k+7 5896 1.1 christos mov x23, v3.d[0] //AES block 4k+3 - mov low 5897 1.1 christos 5898 1.1 christos aese v1.16b, v19.16b 5899 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5900 1.1 christos mov x24, v3.d[1] //AES block 4k+3 - mov high 5901 1.1 christos 5902 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5903 1.1 christos mov d8, v4.d[1] //GHASH block 4k - mid 5904 1.1 christos fmov d3, x10 //CTR block 4k+7 5905 1.1 christos 5906 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5907 1.1 christos fmov v3.d[1], x9 //CTR block 4k+7 5908 1.1 christos 5909 1.1 christos aese v2.16b, v18.16b 5910 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5911 1.1 christos mov d10, v17.d[1] //GHASH block 4k - mid 5912 1.1 christos 5913 1.1 christos aese v0.16b, v19.16b 5914 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5915 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5916 1.1 christos 5917 1.1 christos pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5918 1.1 christos 5919 1.1 christos aese v2.16b, v19.16b 5920 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5921 1.1 christos rev64 v7.16b, v7.16b //GHASH block 4k+3 5922 1.1 christos 5923 1.1 christos aese v3.16b, v18.16b 5924 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5925 1.1 christos 5926 1.1 christos pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5927 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5928 1.1 christos 5929 1.1 christos pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5930 1.1 christos 5931 1.1 christos aese v3.16b, v19.16b 5932 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5933 1.1 christos mov d4, v5.d[1] //GHASH block 4k+1 - mid 5934 1.1 christos 5935 1.1 christos aese v0.16b, v20.16b 5936 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5937 1.1 christos 5938 1.1 christos aese v1.16b, v20.16b 5939 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5940 1.1 christos eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5941 1.1 christos 5942 1.1 christos aese v2.16b, v20.16b 5943 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5944 1.1 christos 5945 1.1 christos aese v0.16b, v21.16b 5946 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5947 1.1 christos mov d8, v6.d[1] //GHASH block 4k+2 - mid 5948 1.1 christos 5949 1.1 christos aese v3.16b, v20.16b 5950 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5951 1.1 christos eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5952 1.1 christos 5953 1.1 christos pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5954 1.1 christos 5955 1.1 christos aese v0.16b, v22.16b 5956 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5957 1.1 christos 5958 1.1 christos aese v3.16b, v21.16b 5959 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5960 1.1 christos eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5961 1.1 christos 5962 1.1 christos pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5963 1.1 christos 5964 1.1 christos aese v0.16b, v23.16b 5965 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5966 1.1 christos eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5967 1.1 christos 5968 1.1 christos aese v3.16b, v22.16b 5969 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5970 1.1 christos 5971 1.1 christos pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5972 1.1 christos eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5973 1.1 christos 5974 1.1 christos pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5975 1.1 christos 5976 1.1 christos aese v3.16b, v23.16b 5977 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5978 1.1 christos ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5979 1.1 christos 5980 1.1 christos aese v2.16b, v21.16b 5981 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5982 1.1 christos 5983 1.1 christos aese v1.16b, v21.16b 5984 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5985 1.1 christos eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5986 1.1 christos 5987 1.1 christos pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5988 1.1 christos 5989 1.1 christos aese v2.16b, v22.16b 5990 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5991 1.1 christos mov d6, v7.d[1] //GHASH block 4k+3 - mid 5992 1.1 christos 5993 1.1 christos aese v1.16b, v22.16b 5994 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5995 1.1 christos 5996 1.1 christos pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5997 1.1 christos 5998 1.1 christos aese v2.16b, v23.16b 5999 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 6000 1.1 christos eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 6001 1.1 christos 6002 1.1 christos aese v1.16b, v23.16b 6003 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 6004 1.1 christos 6005 1.1 christos aese v3.16b, v24.16b 6006 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 6007 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 6008 1.1 christos 6009 1.1 christos aese v2.16b, v24.16b 6010 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 6011 1.1 christos 6012 1.1 christos aese v0.16b, v24.16b 6013 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 6014 1.1 christos movi v8.8b, #0xc2 6015 1.1 christos 6016 1.1 christos aese v1.16b, v24.16b 6017 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 6018 1.1 christos eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 6019 1.1 christos 6020 1.1 christos pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 6021 1.1 christos 6022 1.1 christos aese v3.16b, v25.16b 6023 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 6024 1.1 christos eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 6025 1.1 christos 6026 1.1 christos aese v1.16b, v25.16b 6027 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 6028 1.1 christos 6029 1.1 christos aese v0.16b, v25.16b 6030 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 6031 1.1 christos eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 6032 1.1 christos 6033 1.1 christos aese v3.16b, v26.16b 6034 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 6035 1.1 christos 6036 1.1 christos aese v2.16b, v25.16b 6037 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 6038 1.1 christos eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6039 1.1 christos 6040 1.1 christos aese v1.16b, v26.16b 6041 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 6042 1.1 christos 6043 1.1 christos aese v0.16b, v26.16b 6044 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 6045 1.1 christos shl d8, d8, #56 //mod_constant 6046 1.1 christos 6047 1.1 christos aese v2.16b, v26.16b 6048 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 6049 1.1 christos 6050 1.1 christos aese v1.16b, v27.16b 6051 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 6052 1.1 christos eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6053 1.1 christos 6054 1.1 christos pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6055 1.1 christos 6056 1.1 christos aese v2.16b, v27.16b 6057 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 6058 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6059 1.1 christos 6060 1.1 christos aese v3.16b, v27.16b 6061 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 6062 1.1 christos 6063 1.1 christos aese v0.16b, v27.16b 6064 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 6065 1.1 christos eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6066 1.1 christos 6067 1.1 christos aese v2.16b, v28.16b 6068 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 6069 1.1 christos 6070 1.1 christos aese v3.16b, v28.16b 6071 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 6072 1.1 christos 6073 1.1 christos aese v0.16b, v28.16b 6074 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 6075 1.1 christos eor x22, x22, x14 //AES block 4k+2 - round 14 high 6076 1.1 christos #ifdef __AARCH64EB__ 6077 1.1 christos rev x22, x22 6078 1.1 christos #endif 6079 1.1 christos aese v1.16b, v28.16b 6080 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 6081 1.1 christos eor x23, x23, x13 //AES block 4k+3 - round 14 low 6082 1.1 christos #ifdef __AARCH64EB__ 6083 1.1 christos rev x23, x23 6084 1.1 christos #endif 6085 1.1 christos aese v2.16b, v29.16b 6086 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 6087 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6088 1.1 christos 6089 1.1 christos aese v0.16b, v29.16b 6090 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 6091 1.1 christos add w12, w12, #1 //CTR block 4k+7 6092 1.1 christos 6093 1.1 christos aese v1.16b, v29.16b 6094 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 6095 1.1 christos eor x21, x21, x13 //AES block 4k+2 - round 14 low 6096 1.1 christos #ifdef __AARCH64EB__ 6097 1.1 christos rev x21, x21 6098 1.1 christos #endif 6099 1.1 christos 6100 1.1 christos aese v2.16b, v30.16b 6101 1.1 christos aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 6102 1.1 christos 6103 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6104 1.1 christos eor x24, x24, x14 //AES block 4k+3 - round 14 high 6105 1.1 christos #ifdef __AARCH64EB__ 6106 1.1 christos rev x24, x24 6107 1.1 christos #endif 6108 1.1 christos 6109 1.1 christos aese v3.16b, v29.16b 6110 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 6111 1.1 christos stp x21, x22, [x2], #16 //AES block 4k+2 - store result 6112 1.1 christos 6113 1.1 christos aese v1.16b, v30.16b 6114 1.1 christos aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 6115 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6116 1.1 christos 6117 1.1 christos aese v0.16b, v30.16b 6118 1.1 christos aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 6119 1.1 christos stp x23, x24, [x2], #16 //AES block 4k+3 - store result 6120 1.1 christos 6121 1.1 christos aese v3.16b, v30.16b 6122 1.1 christos aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 6123 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6124 1.1 christos 6125 1.1 christos aese v1.16b, v31.16b //AES block 4k+5 - round 13 6126 1.1 christos 6127 1.1 christos aese v0.16b, v31.16b //AES block 4k+4 - round 13 6128 1.1 christos 6129 1.1 christos aese v3.16b, v31.16b //AES block 4k+7 - round 13 6130 1.1 christos 6131 1.1 christos aese v2.16b, v31.16b //AES block 4k+6 - round 13 6132 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 6133 1.1 christos .L256_dec_tail: //TAIL 6134 1.1 christos 6135 1.1 christos sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 6136 1.1 christos ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 6137 1.1 christos 6138 1.1 christos eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 6139 1.1 christos 6140 1.1 christos mov x6, v0.d[0] //AES block 4k+4 - mov low 6141 1.1 christos 6142 1.1 christos mov x7, v0.d[1] //AES block 4k+4 - mov high 6143 1.1 christos ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 6144 1.1 christos 6145 1.1 christos cmp x5, #48 6146 1.1 christos 6147 1.1 christos eor x6, x6, x13 //AES block 4k+4 - round 14 low 6148 1.1 christos #ifdef __AARCH64EB__ 6149 1.1 christos rev x6, x6 6150 1.1 christos #endif 6151 1.1 christos 6152 1.1 christos eor x7, x7, x14 //AES block 4k+4 - round 14 high 6153 1.1 christos #ifdef __AARCH64EB__ 6154 1.1 christos rev x7, x7 6155 1.1 christos #endif 6156 1.1 christos b.gt .L256_dec_blocks_more_than_3 6157 1.1 christos 6158 1.1 christos sub w12, w12, #1 6159 1.1 christos mov v3.16b, v2.16b 6160 1.1 christos movi v10.8b, #0 6161 1.1 christos 6162 1.1 christos movi v11.8b, #0 6163 1.1 christos cmp x5, #32 6164 1.1 christos 6165 1.1 christos movi v9.8b, #0 6166 1.1 christos mov v2.16b, v1.16b 6167 1.1 christos b.gt .L256_dec_blocks_more_than_2 6168 1.1 christos 6169 1.1 christos sub w12, w12, #1 6170 1.1 christos 6171 1.1 christos mov v3.16b, v1.16b 6172 1.1 christos cmp x5, #16 6173 1.1 christos b.gt .L256_dec_blocks_more_than_1 6174 1.1 christos 6175 1.1 christos sub w12, w12, #1 6176 1.1 christos b .L256_dec_blocks_less_than_1 6177 1.1 christos .L256_dec_blocks_more_than_3: //blocks left > 3 6178 1.1 christos rev64 v4.16b, v5.16b //GHASH final-3 block 6179 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 6180 1.1 christos 6181 1.1 christos stp x6, x7, [x2], #16 //AES final-3 block - store result 6182 1.1 christos 6183 1.1 christos mov d10, v17.d[1] //GHASH final-3 block - mid 6184 1.1 christos 6185 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 6186 1.1 christos 6187 1.1 christos eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 6188 1.1 christos 6189 1.1 christos mov d22, v4.d[1] //GHASH final-3 block - mid 6190 1.1 christos 6191 1.1 christos mov x6, v0.d[0] //AES final-2 block - mov low 6192 1.1 christos 6193 1.1 christos mov x7, v0.d[1] //AES final-2 block - mov high 6194 1.1 christos 6195 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 6196 1.1 christos 6197 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 6198 1.1 christos 6199 1.1 christos pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 6200 1.1 christos 6201 1.1 christos pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 6202 1.1 christos eor x6, x6, x13 //AES final-2 block - round 14 low 6203 1.1 christos #ifdef __AARCH64EB__ 6204 1.1 christos rev x6, x6 6205 1.1 christos #endif 6206 1.1 christos 6207 1.1 christos pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 6208 1.1 christos eor x7, x7, x14 //AES final-2 block - round 14 high 6209 1.1 christos #ifdef __AARCH64EB__ 6210 1.1 christos rev x7, x7 6211 1.1 christos #endif 6212 1.1 christos .L256_dec_blocks_more_than_2: //blocks left > 2 6213 1.1 christos 6214 1.1 christos rev64 v4.16b, v5.16b //GHASH final-2 block 6215 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 6216 1.1 christos 6217 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 6218 1.1 christos stp x6, x7, [x2], #16 //AES final-2 block - store result 6219 1.1 christos 6220 1.1 christos eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 6221 1.1 christos 6222 1.1 christos mov d22, v4.d[1] //GHASH final-2 block - mid 6223 1.1 christos 6224 1.1 christos pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 6225 1.1 christos 6226 1.1 christos pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 6227 1.1 christos 6228 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 6229 1.1 christos mov x6, v0.d[0] //AES final-1 block - mov low 6230 1.1 christos 6231 1.1 christos mov x7, v0.d[1] //AES final-1 block - mov high 6232 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 6233 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 6234 1.1 christos 6235 1.1 christos pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 6236 1.1 christos 6237 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 6238 1.1 christos eor x6, x6, x13 //AES final-1 block - round 14 low 6239 1.1 christos #ifdef __AARCH64EB__ 6240 1.1 christos rev x6, x6 6241 1.1 christos #endif 6242 1.1 christos 6243 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 6244 1.1 christos eor x7, x7, x14 //AES final-1 block - round 14 high 6245 1.1 christos #ifdef __AARCH64EB__ 6246 1.1 christos rev x7, x7 6247 1.1 christos #endif 6248 1.1 christos .L256_dec_blocks_more_than_1: //blocks left > 1 6249 1.1 christos 6250 1.1 christos stp x6, x7, [x2], #16 //AES final-1 block - store result 6251 1.1 christos rev64 v4.16b, v5.16b //GHASH final-1 block 6252 1.1 christos 6253 1.1 christos ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 6254 1.1 christos 6255 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 6256 1.1 christos movi v8.8b, #0 //suppress further partial tag feed in 6257 1.1 christos 6258 1.1 christos mov d22, v4.d[1] //GHASH final-1 block - mid 6259 1.1 christos 6260 1.1 christos eor v0.16b, v5.16b, v3.16b //AES final block - result 6261 1.1 christos 6262 1.1 christos pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 6263 1.1 christos 6264 1.1 christos eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 6265 1.1 christos 6266 1.1 christos pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 6267 1.1 christos mov x6, v0.d[0] //AES final block - mov low 6268 1.1 christos 6269 1.1 christos ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 6270 1.1 christos 6271 1.1 christos mov x7, v0.d[1] //AES final block - mov high 6272 1.1 christos 6273 1.1 christos pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 6274 1.1 christos eor x6, x6, x13 //AES final block - round 14 low 6275 1.1 christos #ifdef __AARCH64EB__ 6276 1.1 christos rev x6, x6 6277 1.1 christos #endif 6278 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 6279 1.1 christos 6280 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 6281 1.1 christos 6282 1.1 christos eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 6283 1.1 christos eor x7, x7, x14 //AES final block - round 14 high 6284 1.1 christos #ifdef __AARCH64EB__ 6285 1.1 christos rev x7, x7 6286 1.1 christos #endif 6287 1.1 christos .L256_dec_blocks_less_than_1: //blocks left <= 1 6288 1.1 christos 6289 1.1 christos and x1, x1, #127 //bit_length %= 128 6290 1.1 christos mvn x14, xzr //rk14_h = 0xffffffffffffffff 6291 1.1 christos 6292 1.1 christos sub x1, x1, #128 //bit_length -= 128 6293 1.1 christos mvn x13, xzr //rk14_l = 0xffffffffffffffff 6294 1.1 christos 6295 1.1 christos ldp x4, x5, [x2] //load existing bytes we need to not overwrite 6296 1.1 christos neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 6297 1.1 christos 6298 1.1 christos and x1, x1, #127 //bit_length %= 128 6299 1.1 christos 6300 1.1 christos lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 6301 1.1 christos cmp x1, #64 6302 1.1 christos 6303 1.1 christos csel x9, x13, x14, lt 6304 1.1 christos csel x10, x14, xzr, lt 6305 1.1 christos 6306 1.1 christos fmov d0, x9 //ctr0b is mask for last block 6307 1.1 christos and x6, x6, x9 6308 1.1 christos 6309 1.1 christos mov v0.d[1], x10 6310 1.1 christos bic x4, x4, x9 //mask out low existing bytes 6311 1.1 christos 6312 1.1 christos #ifndef __AARCH64EB__ 6313 1.1 christos rev w9, w12 6314 1.1 christos #else 6315 1.1 christos mov w9, w12 6316 1.1 christos #endif 6317 1.1 christos 6318 1.1 christos bic x5, x5, x10 //mask out high existing bytes 6319 1.1 christos 6320 1.1 christos orr x6, x6, x4 6321 1.1 christos 6322 1.1 christos and x7, x7, x10 6323 1.1 christos 6324 1.1 christos orr x7, x7, x5 6325 1.1 christos 6326 1.1 christos and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 6327 1.1 christos 6328 1.1 christos rev64 v4.16b, v5.16b //GHASH final block 6329 1.1 christos 6330 1.1 christos eor v4.16b, v4.16b, v8.16b //feed in partial tag 6331 1.1 christos 6332 1.1 christos pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 6333 1.1 christos 6334 1.1 christos mov d8, v4.d[1] //GHASH final block - mid 6335 1.1 christos 6336 1.1 christos eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 6337 1.1 christos 6338 1.1 christos pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 6339 1.1 christos 6340 1.1 christos pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 6341 1.1 christos 6342 1.1 christos eor v9.16b, v9.16b, v20.16b //GHASH final block - high 6343 1.1 christos 6344 1.1 christos eor v11.16b, v11.16b, v21.16b //GHASH final block - low 6345 1.1 christos 6346 1.1 christos eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 6347 1.1 christos movi v8.8b, #0xc2 6348 1.1 christos 6349 1.1 christos eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6350 1.1 christos 6351 1.1 christos shl d8, d8, #56 //mod_constant 6352 1.1 christos 6353 1.1 christos eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6354 1.1 christos 6355 1.1 christos pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6356 1.1 christos 6357 1.1 christos ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6358 1.1 christos 6359 1.1 christos eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6360 1.1 christos 6361 1.1 christos eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6362 1.1 christos 6363 1.1 christos pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6364 1.1 christos 6365 1.1 christos ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6366 1.1 christos 6367 1.1 christos eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6368 1.1 christos 6369 1.1 christos stp x6, x7, [x2] 6370 1.1 christos 6371 1.1 christos str w9, [x16, #12] //store the updated counter 6372 1.1 christos 6373 1.1 christos eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 6374 1.1 christos ext v11.16b, v11.16b, v11.16b, #8 6375 1.1 christos rev64 v11.16b, v11.16b 6376 1.1 christos mov x0, x15 6377 1.1 christos st1 { v11.16b }, [x3] 6378 1.1 christos 6379 1.1 christos ldp x21, x22, [sp, #16] 6380 1.1 christos ldp x23, x24, [sp, #32] 6381 1.1 christos ldp d8, d9, [sp, #48] 6382 1.1 christos ldp d10, d11, [sp, #64] 6383 1.1 christos ldp d12, d13, [sp, #80] 6384 1.1 christos ldp d14, d15, [sp, #96] 6385 1.1 christos ldp x19, x20, [sp], #112 6386 1.1 christos ret 6387 1.1 christos 6388 1.1 christos .L256_dec_ret: 6389 1.1 christos mov w0, #0x0 6390 1.1 christos ret 6391 1.1 christos .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6392 1.2 christos .section .rodata 6393 1.1 christos .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6394 1.1 christos .align 2 6395 1.1 christos .align 2 6396 1.1 christos #endif 6397