Home | History | Annotate | Line # | Download | only in arm
      1 #include "arm_asm.h"
      2 #include "arm_arch.h"
      3 
      4 #if __ARM_MAX_ARCH__>=8
      5 .fpu	neon
      6 #ifdef __thumb2__
        /* Thumb-2 build: select unified assembler syntax and Thumb encoding. */
      7 .syntax	unified
      8 .thumb
        /*
         * INST(a,b,c,d) expands to the comma-separated list c,0xef,a,b.
         * Argument d is accepted but does not appear in the expansion.
         */
      9 # define INST(a,b,c,d)   c,0xef,a,b
     10 #else
        /* Not a Thumb-2 build: assemble as 32-bit ARM code. */
     11 .code	32
        /*
         * Same macro for this mode, with a different ordering and constant:
         * the expansion is a,b,c,0xf2 and argument d is again unused.
         * NOTE(review): no use of INST appears in the portion of the file
         * visible here - confirm whether it is still needed.
         */
     12 # define INST(a,b,c,d)   a,b,c,0xf2
     13 #endif
     14 
     15 .text
     16 .globl	aes_gcm_enc_128_kernel
     17 .type	aes_gcm_enc_128_kernel,%function
     18 .align	4
     19 aes_gcm_enc_128_kernel:
     20 	cbz	r1, .L128_enc_ret
     21 	stp	r19, r20, [sp, #-112]!
     22 	mov	r16, r4
     23 	mov	r8, r5
     24 	stp	r21, r22, [sp, #16]
     25 	stp	r23, r24, [sp, #32]
     26 	stp	d8, d9, [sp, #48]
     27 	stp	d10, d11, [sp, #64]
     28 	stp	d12, d13, [sp, #80]
     29 	stp	d14, d15, [sp, #96]
     30 
     31 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
     32 #ifdef __ARMEB__
     33 	rev	r10, r10
     34 	rev	r11, r11
     35 #endif
     36 	ldp	r13, r14, [r8, #160]                     @ load rk10
     37 #ifdef __ARMEB__
     38 	ror	r13, r13, #32
     39 	ror	r14, r14, #32
     40 #endif
     41 	ld1	{v11.16b}, [r3]
     42 	ext	v11.16b, v11.16b, v11.16b, #8
     43 	rev64	v11.16b, v11.16b
     44 	lsr	r5, r1, #3              @ byte_len
     45 	mov	r15, r5
     46 
     47 	ld1	{v18.4s}, [r8], #16								  @ load rk0
     48 	add	r4, r0, r1, lsr #3   @ end_input_ptr
     49 	sub	r5, r5, #1      @ byte_len - 1
     50 
     51 	lsr	r12, r11, #32
     52 	ldr	q15, [r3, #112]                        @ load h4l | h4h
     53 #ifndef __ARMEB__
     54 	ext	v15.16b, v15.16b, v15.16b, #8
     55 #endif
     56 	fmov	d1, r10                               @ CTR block 1
     57 	rev	r12, r12                                @ rev_ctr32
     58 
     59 	add	r12, r12, #1                            @ increment rev_ctr32
     60 	orr	r11, r11, r11
     61 	ld1	{v19.4s}, [r8], #16								  @ load rk1
     62 
     63 	rev	r9, r12                                 @ CTR block 1
     64 	add	r12, r12, #1                            @ CTR block 1
     65 	fmov	d3, r10                               @ CTR block 3
     66 
     67 	orr	r9, r11, r9, lsl #32            @ CTR block 1
     68 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
     69 
     70 	fmov	v1.d[1], r9                               @ CTR block 1
     71 	rev	r9, r12                                 @ CTR block 2
     72 
     73 	fmov	d2, r10                               @ CTR block 2
     74 	orr	r9, r11, r9, lsl #32            @ CTR block 2
     75 	add	r12, r12, #1                            @ CTR block 2
     76 
     77 	fmov	v2.d[1], r9                               @ CTR block 2
     78 	rev	r9, r12                                 @ CTR block 3
     79 
     80 	orr	r9, r11, r9, lsl #32            @ CTR block 3
     81 	ld1	{v20.4s}, [r8], #16								  @ load rk2
     82 
     83 	add	r12, r12, #1                            @ CTR block 3
     84 	fmov	v3.d[1], r9                               @ CTR block 3
     85 
     86 	ldr	q14, [r3, #80]                         @ load h3l | h3h
     87 #ifndef __ARMEB__
     88 	ext	v14.16b, v14.16b, v14.16b, #8
     89 #endif
     90 	aese	q1, v18.16b
     91 	aesmc	q1, q1          @ AES block 1 - round 0
     92 	ld1	{v21.4s}, [r8], #16								  @ load rk3
     93 
     94 	aese	q2, v18.16b
     95 	aesmc	q2, q2          @ AES block 2 - round 0
     96 	ldr	q12, [r3, #32]                         @ load h1l | h1h
     97 #ifndef __ARMEB__
     98 	ext	v12.16b, v12.16b, v12.16b, #8
     99 #endif
    100 
    101 	aese	q0, v18.16b
    102 	aesmc	q0, q0          @ AES block 0 - round 0
    103 	ld1	{v22.4s}, [r8], #16								  @ load rk4
    104 
    105 	aese	q3, v18.16b
    106 	aesmc	q3, q3          @ AES block 3 - round 0
    107 	ld1	{v23.4s}, [r8], #16								  @ load rk5
    108 
    109 	aese	q2, v19.16b
    110 	aesmc	q2, q2          @ AES block 2 - round 1
    111 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
    112 
    113 	aese	q0, v19.16b
    114 	aesmc	q0, q0          @ AES block 0 - round 1
    115 	ld1	{v24.4s}, [r8], #16								  @ load rk6
    116 
    117 	aese	q1, v19.16b
    118 	aesmc	q1, q1          @ AES block 1 - round 1
    119 	ld1	{v25.4s}, [r8], #16								  @ load rk7
    120 
    121 	aese	q3, v19.16b
    122 	aesmc	q3, q3          @ AES block 3 - round 1
    123 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
    124 
    125 	aese	q0, v20.16b
    126 	aesmc	q0, q0          @ AES block 0 - round 2
    127 	ld1	{v26.4s}, [r8], #16								  @ load rk8
    128 
    129 	aese	q1, v20.16b
    130 	aesmc	q1, q1          @ AES block 1 - round 2
    131 	ldr	q13, [r3, #64]                         @ load h2l | h2h
    132 #ifndef __ARMEB__
    133 	ext	v13.16b, v13.16b, v13.16b, #8
    134 #endif
    135 
    136 	aese	q3, v20.16b
    137 	aesmc	q3, q3          @ AES block 3 - round 2
    138 
    139 	aese	q2, v20.16b
    140 	aesmc	q2, q2          @ AES block 2 - round 2
    141 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
    142 
    143 	aese	q0, v21.16b
    144 	aesmc	q0, q0          @ AES block 0 - round 3
    145 
    146 	aese	q1, v21.16b
    147 	aesmc	q1, q1          @ AES block 1 - round 3
    148 
    149 	aese	q2, v21.16b
    150 	aesmc	q2, q2          @ AES block 2 - round 3
    151 	ld1	{v27.4s}, [r8], #16								  @ load rk9
    152 
    153 	aese	q3, v21.16b
    154 	aesmc	q3, q3          @ AES block 3 - round 3
    155 
    156 	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    157 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
    158 
    159 	aese	q3, v22.16b
    160 	aesmc	q3, q3          @ AES block 3 - round 4
    161 	add	r5, r5, r0
    162 
    163 	aese	q2, v22.16b
    164 	aesmc	q2, q2          @ AES block 2 - round 4
    165 	cmp	r0, r5                   @ check if we have <= 4 blocks
    166 
    167 	aese	q0, v22.16b
    168 	aesmc	q0, q0          @ AES block 0 - round 4
    169 
    170 	aese	q3, v23.16b
    171 	aesmc	q3, q3          @ AES block 3 - round 5
    172 
    173 	aese	q2, v23.16b
    174 	aesmc	q2, q2          @ AES block 2 - round 5
    175 
    176 	aese	q0, v23.16b
    177 	aesmc	q0, q0          @ AES block 0 - round 5
    178 
    179 	aese	q3, v24.16b
    180 	aesmc	q3, q3          @ AES block 3 - round 6
    181 
    182 	aese	q1, v22.16b
    183 	aesmc	q1, q1          @ AES block 1 - round 4
    184 
    185 	aese	q2, v24.16b
    186 	aesmc	q2, q2          @ AES block 2 - round 6
    187 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
    188 
    189 	aese	q0, v24.16b
    190 	aesmc	q0, q0          @ AES block 0 - round 6
    191 
    192 	aese	q1, v23.16b
    193 	aesmc	q1, q1          @ AES block 1 - round 5
    194 
    195 	aese	q3, v25.16b
    196 	aesmc	q3, q3          @ AES block 3 - round 7
    197 
    198 	aese	q0, v25.16b
    199 	aesmc	q0, q0          @ AES block 0 - round 7
    200 
    201 	aese	q1, v24.16b
    202 	aesmc	q1, q1          @ AES block 1 - round 6
    203 
    204 	aese	q2, v25.16b
    205 	aesmc	q2, q2          @ AES block 2 - round 7
    206 
    207 	aese	q0, v26.16b
    208 	aesmc	q0, q0          @ AES block 0 - round 8
    209 
    210 	aese	q1, v25.16b
    211 	aesmc	q1, q1          @ AES block 1 - round 7
    212 
    213 	aese	q2, v26.16b
    214 	aesmc	q2, q2          @ AES block 2 - round 8
    215 
    216 	aese	q3, v26.16b
    217 	aesmc	q3, q3          @ AES block 3 - round 8
    218 
    219 	aese	q1, v26.16b
    220 	aesmc	q1, q1          @ AES block 1 - round 8
    221 
    222 	aese	q2, v27.16b                                      @ AES block 2 - round 9
    223 
    224 	aese	q0, v27.16b                                      @ AES block 0 - round 9
    225 
    226 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
    227 
    228 	aese	q1, v27.16b                                      @ AES block 1 - round 9
    229 
    230 	aese	q3, v27.16b                                      @ AES block 3 - round 9
    231 	bge	.L128_enc_tail                                    @ handle tail
    232 
    233 	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
    234 #ifdef __ARMEB__
    235 	rev	r6, r6
    236 	rev	r7, r7
    237 #endif
    238 	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
    239 #ifdef __ARMEB__
    240 	rev	r21, r21
    241 	rev	r22, r22
    242 #endif
    243 	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
    244 #ifdef __ARMEB__
    245 	rev	r19, r19
    246 	rev	r20, r20
    247 #endif
    248 	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
    249 #ifdef __ARMEB__
    250 	rev	r23, r23
    251 	rev	r24, r24
    252 #endif
    253 	eor	r6, r6, r13                     @ AES block 0 - round 10 low
    254 	eor	r7, r7, r14                     @ AES block 0 - round 10 high
    255 
    256 	eor	r21, r21, r13                     @ AES block 2 - round 10 low
    257 	fmov	d4, r6                               @ AES block 0 - mov low
    258 
    259 	eor	r19, r19, r13                     @ AES block 1 - round 10 low
    260 	eor	r22, r22, r14                     @ AES block 2 - round 10 high
    261 	fmov	v4.d[1], r7                           @ AES block 0 - mov high
    262 
    263 	fmov	d5, r19                               @ AES block 1 - mov low
    264 	eor	r20, r20, r14                     @ AES block 1 - round 10 high
    265 
    266 	eor	r23, r23, r13                     @ AES block 3 - round 10 low
    267 	fmov	v5.d[1], r20                           @ AES block 1 - mov high
    268 
    269 	fmov	d6, r21                               @ AES block 2 - mov low
    270 	eor	r24, r24, r14                     @ AES block 3 - round 10 high
    271 	rev	r9, r12                                 @ CTR block 4
    272 
    273 	fmov	v6.d[1], r22                           @ AES block 2 - mov high
    274 	orr	r9, r11, r9, lsl #32            @ CTR block 4
    275 
    276 	eor	q4, q4, q0                          @ AES block 0 - result
    277 	fmov	d0, r10                               @ CTR block 4
    278 	add	r12, r12, #1                            @ CTR block 4
    279 
    280 	fmov	v0.d[1], r9                               @ CTR block 4
    281 	rev	r9, r12                                 @ CTR block 5
    282 
    283 	eor	q5, q5, q1                          @ AES block 1 - result
    284 	fmov	d1, r10                               @ CTR block 5
    285 	orr	r9, r11, r9, lsl #32            @ CTR block 5
    286 
    287 	add	r12, r12, #1                            @ CTR block 5
    288 	add	r0, r0, #64                       @ AES input_ptr update
    289 	fmov	v1.d[1], r9                               @ CTR block 5
    290 
    291 	fmov	d7, r23                               @ AES block 3 - mov low
    292 	rev	r9, r12                                 @ CTR block 6
    293 	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
    294 
    295 	fmov	v7.d[1], r24                           @ AES block 3 - mov high
    296 	orr	r9, r11, r9, lsl #32            @ CTR block 6
    297 
    298 	add	r12, r12, #1                            @ CTR block 6
    299 	eor	q6, q6, q2                          @ AES block 2 - result
    300 	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
    301 
    302 	fmov	d2, r10                               @ CTR block 6
    303 	cmp	r0, r5                   @ check if we have <= 8 blocks
    304 
    305 	fmov	v2.d[1], r9                               @ CTR block 6
    306 	rev	r9, r12                                 @ CTR block 7
    307 	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
    308 
    309 	orr	r9, r11, r9, lsl #32            @ CTR block 7
    310 
    311 	eor	q7, q7, q3                          @ AES block 3 - result
    312 	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
    313 	bge	.L128_enc_prepretail                              @ do prepretail
    314 
    315 .L128_enc_main_loop:@ main loop start
    316 	ldp	r23, r24, [r0, #48]           @ AES block 4k+3 - load plaintext
    317 #ifdef __ARMEB__
    318 	rev	r23, r23
    319 	rev	r24, r24
    320 #endif
    321 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
    322 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
    323 
    324 	aese	q2, v18.16b
    325 	aesmc	q2, q2          @ AES block 4k+6 - round 0
    326 	fmov	d3, r10                               @ CTR block 4k+3
    327 
    328 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
    329 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
    330 
    331 	aese	q1, v18.16b
    332 	aesmc	q1, q1          @ AES block 4k+5 - round 0
    333 	add	r12, r12, #1                            @ CTR block 4k+3
    334 	fmov	v3.d[1], r9                               @ CTR block 4k+3
    335 
    336 	aese	q0, v18.16b
    337 	aesmc	q0, q0          @ AES block 4k+4 - round 0
    338 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
    339 
    340 	aese	q2, v19.16b
    341 	aesmc	q2, q2          @ AES block 4k+6 - round 1
    342 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
    343 
    344 	aese	q1, v19.16b
    345 	aesmc	q1, q1          @ AES block 4k+5 - round 1
    346 	eor	q4, q4, v11.16b                           @ PRE 1
    347 
    348 	aese	q3, v18.16b
    349 	aesmc	q3, q3          @ AES block 4k+7 - round 0
    350 	eor	r24, r24, r14                     @ AES block 4k+3 - round 10 high
    351 
    352 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
    353 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
    354 	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
    355 #ifdef __ARMEB__
    356 	rev	r6, r6
    357 	rev	r7, r7
    358 #endif
    359 	aese	q0, v19.16b
    360 	aesmc	q0, q0          @ AES block 4k+4 - round 1
    361 	rev	r9, r12                                 @ CTR block 4k+8
    362 
    363 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
    364 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
    365 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
    366 
    367 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
    368 	add	r12, r12, #1                            @ CTR block 4k+8
    369 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
    370 
    371 	aese	q0, v20.16b
    372 	aesmc	q0, q0          @ AES block 4k+4 - round 2
    373 
    374 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
    375 	eor	q8, q8, q4                          @ GHASH block 4k - mid
    376 
    377 	aese	q1, v20.16b
    378 	aesmc	q1, q1          @ AES block 4k+5 - round 2
    379 
    380 	aese	q0, v21.16b
    381 	aesmc	q0, q0          @ AES block 4k+4 - round 3
    382 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
    383 
    384 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
    385 
    386 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
    387 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
    388 
    389 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
    390 
    391 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
    392 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
    393 
    394 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
    395 	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
    396 
    397 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
    398 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
    399 
    400 	aese	q3, v19.16b
    401 	aesmc	q3, q3          @ AES block 4k+7 - round 1
    402 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
    403 
    404 	aese	q2, v20.16b
    405 	aesmc	q2, q2          @ AES block 4k+6 - round 2
    406 	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
    407 
    408 	aese	q1, v21.16b
    409 	aesmc	q1, q1          @ AES block 4k+5 - round 3
    410 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
    411 
    412 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
    413 
    414 	aese	q2, v21.16b
    415 	aesmc	q2, q2          @ AES block 4k+6 - round 3
    416 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
    417 
    418 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
    419 
    420 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
    421 	movi	q8, #0xc2
    422 
    423 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
    424 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
    425 
    426 	aese	q1, v22.16b
    427 	aesmc	q1, q1          @ AES block 4k+5 - round 4
    428 
    429 	aese	q3, v20.16b
    430 	aesmc	q3, q3          @ AES block 4k+7 - round 2
    431 	shl	d8, d8, #56               @ mod_constant
    432 
    433 	aese	q0, v22.16b
    434 	aesmc	q0, q0          @ AES block 4k+4 - round 4
    435 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
    436 
    437 	aese	q1, v23.16b
    438 	aesmc	q1, q1          @ AES block 4k+5 - round 5
    439 	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
    440 #ifdef __ARMEB__
    441 	rev	r19, r19
    442 	rev	r20, r20
    443 #endif
    444 	aese	q3, v21.16b
    445 	aesmc	q3, q3          @ AES block 4k+7 - round 3
    446 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
    447 
    448 	aese	q0, v23.16b
    449 	aesmc	q0, q0          @ AES block 4k+4 - round 5
    450 	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
    451 #ifdef __ARMEB__
    452 	rev	r21, r21
    453 	rev	r22, r22
    454 #endif
    455 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
    456 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
    457 
    458 	aese	q2, v22.16b
    459 	aesmc	q2, q2          @ AES block 4k+6 - round 4
    460 	eor	r19, r19, r13                     @ AES block 4k+5 - round 10 low
    461 
    462 	aese	q3, v22.16b
    463 	aesmc	q3, q3          @ AES block 4k+7 - round 4
    464 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
    465 
    466 	aese	q1, v24.16b
    467 	aesmc	q1, q1          @ AES block 4k+5 - round 6
    468 	eor	r23, r23, r13                     @ AES block 4k+3 - round 10 low
    469 
    470 	aese	q2, v23.16b
    471 	aesmc	q2, q2          @ AES block 4k+6 - round 5
    472 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
    473 
    474 	fmov	d4, r6                               @ AES block 4k+4 - mov low
    475 	aese	q0, v24.16b
    476 	aesmc	q0, q0          @ AES block 4k+4 - round 6
    477 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
    478 
    479 	add	r0, r0, #64                       @ AES input_ptr update
    480 	fmov	d7, r23                               @ AES block 4k+3 - mov low
    481 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
    482 
    483 	aese	q3, v23.16b
    484 	aesmc	q3, q3          @ AES block 4k+7 - round 5
    485 	fmov	d5, r19                               @ AES block 4k+5 - mov low
    486 
    487 	aese	q0, v25.16b
    488 	aesmc	q0, q0          @ AES block 4k+4 - round 7
    489 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
    490 
    491 	aese	q2, v24.16b
    492 	aesmc	q2, q2          @ AES block 4k+6 - round 6
    493 	eor	r20, r20, r14                     @ AES block 4k+5 - round 10 high
    494 
    495 	aese	q1, v25.16b
    496 	aesmc	q1, q1          @ AES block 4k+5 - round 7
    497 	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
    498 
    499 	aese	q0, v26.16b
    500 	aesmc	q0, q0          @ AES block 4k+4 - round 8
    501 	fmov	v7.d[1], r24                           @ AES block 4k+3 - mov high
    502 
    503 	aese	q3, v24.16b
    504 	aesmc	q3, q3          @ AES block 4k+7 - round 6
    505 	cmp	r0, r5                   @ .LOOP CONTROL
    506 
    507 	aese	q1, v26.16b
    508 	aesmc	q1, q1          @ AES block 4k+5 - round 8
    509 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
    510 
    511 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
    512 	eor	r21, r21, r13                     @ AES block 4k+6 - round 10 low
    513 	eor	r22, r22, r14                     @ AES block 4k+6 - round 10 high
    514 
    515 	aese	q3, v25.16b
    516 	aesmc	q3, q3          @ AES block 4k+7 - round 7
    517 	fmov	d6, r21                               @ AES block 4k+6 - mov low
    518 
    519 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
    520 	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
    521 
    522 	aese	q2, v25.16b
    523 	aesmc	q2, q2          @ AES block 4k+6 - round 7
    524 	eor	q4, q4, q0                          @ AES block 4k+4 - result
    525 
    526 	fmov	d0, r10                               @ CTR block 4k+8
    527 	aese	q3, v26.16b
    528 	aesmc	q3, q3          @ AES block 4k+7 - round 8
    529 
    530 	fmov	v0.d[1], r9                               @ CTR block 4k+8
    531 	rev	r9, r12                                 @ CTR block 4k+9
    532 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
    533 
    534 	aese	q2, v26.16b
    535 	aesmc	q2, q2          @ AES block 4k+6 - round 8
    536 	eor	q5, q5, q1                          @ AES block 4k+5 - result
    537 
    538 	add	r12, r12, #1                            @ CTR block 4k+9
    539 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
    540 	fmov	d1, r10                               @ CTR block 4k+9
    541 
    542 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
    543 	fmov	v1.d[1], r9                               @ CTR block 4k+9
    544 	rev	r9, r12                                 @ CTR block 4k+10
    545 
    546 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
    547 	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
    548 	eor	q6, q6, q2                          @ AES block 4k+6 - result
    549 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
    550 
    551 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
    552 	add	r12, r12, #1                            @ CTR block 4k+10
    553 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
    554 	fmov	d2, r10                               @ CTR block 4k+10
    555 
    556 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
    557 	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
    558 
    559 	fmov	v2.d[1], r9                               @ CTR block 4k+10
    560 	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
    561 	rev	r9, r12                                 @ CTR block 4k+11
    562 
    563 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
    564 	eor	q7, q7, q3                          @ AES block 4k+3 - result
    565 
    566 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
    567 	st1	{ q7}, [r2], #16                     @ AES block 4k+3 - store result
    568 	blt	.L128_enc_main_loop
    569 
    570 .L128_enc_prepretail:@ PREPRETAIL
    571 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
    572 	fmov	d3, r10                               @ CTR block 4k+3
    573 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
    574 
    575 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
    576 	add	r12, r12, #1                            @ CTR block 4k+3
    577 	fmov	v3.d[1], r9                               @ CTR block 4k+3
    578 
    579 	aese	q1, v18.16b
    580 	aesmc	q1, q1          @ AES block 4k+5 - round 0
    581 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
    582 
    583 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
    584 
    585 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
    586 	eor	q4, q4, v11.16b                           @ PRE 1
    587 
    588 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
    589 
    590 	aese	q3, v18.16b
    591 	aesmc	q3, q3          @ AES block 4k+7 - round 0
    592 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
    593 
    594 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
    595 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
    596 
    597 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
    598 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
    599 
    600 	aese	q1, v19.16b
    601 	aesmc	q1, q1          @ AES block 4k+5 - round 1
    602 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
    603 
    604 	eor	q8, q8, q4                          @ GHASH block 4k - mid
    605 
    606 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
    607 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
    608 
    609 	aese	q3, v19.16b
    610 	aesmc	q3, q3          @ AES block 4k+7 - round 1
    611 
    612 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
    613 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
    614 
    615 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
    616 
    617 	aese	q0, v18.16b
    618 	aesmc	q0, q0          @ AES block 4k+4 - round 0
    619 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
    620 
    621 	aese	q2, v18.16b
    622 	aesmc	q2, q2          @ AES block 4k+6 - round 0
    623 
    624 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
    625 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
    626 
    627 	aese	q0, v19.16b
    628 	aesmc	q0, q0          @ AES block 4k+4 - round 1
    629 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
    630 
    631 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
    632 
    633 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
    634 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
    635 
    636 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
    637 
    638 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
    639 
    640 	aese	q2, v19.16b
    641 	aesmc	q2, q2          @ AES block 4k+6 - round 1
    642 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
    643 
    644 	aese	q0, v20.16b
    645 	aesmc	q0, q0          @ AES block 4k+4 - round 2
    646 
    647 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
    648 	movi	q8, #0xc2
    649 
    650 	aese	q2, v20.16b
    651 	aesmc	q2, q2          @ AES block 4k+6 - round 2
    652 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
    653 
    654 	aese	q3, v20.16b
    655 	aesmc	q3, q3          @ AES block 4k+7 - round 2
    656 
    657 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
    658 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
    659 
    660 	aese	q2, v21.16b
    661 	aesmc	q2, q2          @ AES block 4k+6 - round 3
    662 
    663 	aese	q1, v20.16b
    664 	aesmc	q1, q1          @ AES block 4k+5 - round 2
    665 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
    666 
    667 	aese	q0, v21.16b
    668 	aesmc	q0, q0          @ AES block 4k+4 - round 3
    669 
    670 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
    671 	shl	d8, d8, #56               @ mod_constant
    672 
    673 	aese	q1, v21.16b
    674 	aesmc	q1, q1          @ AES block 4k+5 - round 3
    675 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
    676 
    677 	aese	q0, v22.16b
    678 	aesmc	q0, q0          @ AES block 4k+4 - round 4
    679 
    680 	pmull	v28.1q, q9, q8
    681 	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
    682 
    683 	aese	q1, v22.16b
    684 	aesmc	q1, q1          @ AES block 4k+5 - round 4
    685 
    686 	aese	q0, v23.16b
    687 	aesmc	q0, q0          @ AES block 4k+4 - round 5
    688 	ext	q9, q9, q9, #8
    689 
    690 	aese	q3, v21.16b
    691 	aesmc	q3, q3          @ AES block 4k+7 - round 3
    692 
    693 	aese	q2, v22.16b
    694 	aesmc	q2, q2          @ AES block 4k+6 - round 4
    695 	eor	v10.16b, v10.16b, v11.16b
    696 
    697 	aese	q0, v24.16b
    698 	aesmc	q0, q0          @ AES block 4k+4 - round 6
    699 
    700 	aese	q3, v22.16b
    701 	aesmc	q3, q3          @ AES block 4k+7 - round 4
    702 
    703 	aese	q1, v23.16b
    704 	aesmc	q1, q1          @ AES block 4k+5 - round 5
    705 
    706 	aese	q2, v23.16b
    707 	aesmc	q2, q2          @ AES block 4k+6 - round 5
    708 	eor	v10.16b, v10.16b, v28.16b
    709 
    710 	aese	q3, v23.16b
    711 	aesmc	q3, q3          @ AES block 4k+7 - round 5
    712 
    713 	aese	q1, v24.16b
    714 	aesmc	q1, q1          @ AES block 4k+5 - round 6
    715 
    716 	aese	q2, v24.16b
    717 	aesmc	q2, q2          @ AES block 4k+6 - round 6
    718 
    719 	aese	q3, v24.16b
    720 	aesmc	q3, q3          @ AES block 4k+7 - round 6
    721 	eor	v10.16b, v10.16b, q9
    722 
    723 	aese	q0, v25.16b
    724 	aesmc	q0, q0          @ AES block 4k+4 - round 7
    725 
    726 	aese	q2, v25.16b
    727 	aesmc	q2, q2          @ AES block 4k+6 - round 7
    728 
    729 	aese	q3, v25.16b
    730 	aesmc	q3, q3          @ AES block 4k+7 - round 7
    731 
    732 	pmull	v28.1q, v10.1d, q8
    733 
    734 	aese	q1, v25.16b
    735 	aesmc	q1, q1          @ AES block 4k+5 - round 7
    736 	ext	v10.16b, v10.16b, v10.16b, #8
    737 
    738 	aese	q3, v26.16b
    739 	aesmc	q3, q3          @ AES block 4k+7 - round 8
    740 
    741 	aese	q0, v26.16b
    742 	aesmc	q0, q0          @ AES block 4k+4 - round 8
    743 	eor	v11.16b, v11.16b, v28.16b
    744 
    745 	aese	q1, v26.16b
    746 	aesmc	q1, q1          @ AES block 4k+5 - round 8
    747 
    748 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
    749 
    750 	aese	q2, v26.16b
    751 	aesmc	q2, q2          @ AES block 4k+6 - round 8
    752 
    753 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
    754 
    755 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
    756 	eor	v11.16b, v11.16b, v10.16b
    757 
    758 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
    759 .L128_enc_tail:@ TAIL
    760 @ Encrypts the first remaining block, then picks an entry point below from the byte count left (r5 = r4 - r0) compared with 48/32/16.
    761 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
    762 	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
    763 #ifdef __ARMEB__
    764 	rev	r6, r6
    765 	rev	r7, r7
    766 #endif
    767 	cmp	r5, #48                         @ more than 3 blocks left? flags are consumed by the bgt below
    768 
    769 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
    770 	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
    771 	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
    772 
    773 	fmov	d4, r6                               @ AES block 4k+4 - mov low
    774 
    775 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
    776 
    777 	eor	q5, q4, q0                          @ AES block 4k+4 - result
    778 
    779 	bgt	.L128_enc_blocks_more_than_3
    780 @ At most 3 blocks left: this path zeroes v11/v9/v10 (the entry points below XOR into them) and shifts keystream down.
    781 	sub	r12, r12, #1                            @ step rev_ctr32 back by one (r12 is byte-reversed and stored at [r16, #12] on exit)
    782 	movi	v11.8b, #0                             @ GHASH low accumulator = 0
    783 	mov	q3, q2                                  @ q3 is the keystream XORed into the final block
    784 
    785 	cmp	r5, #32                         @ more than 2 blocks left?
    786 	mov	q2, q1                                  @ q2 is the keystream XORed into the final-1 block
    787 	movi	q9, #0                                  @ GHASH high accumulator = 0
    788 
    789 	movi	v10.8b, #0                             @ GHASH mid accumulator = 0
    790 	bgt	.L128_enc_blocks_more_than_2
    791 
    792 	mov	q3, q1                                  @ at most 2 blocks left: final block uses the original q1 keystream
    793 	cmp	r5, #16                         @ more than 1 block left?
    794 
    795 	sub	r12, r12, #1                            @ step rev_ctr32 back again
    796 	bgt	.L128_enc_blocks_more_than_1
    797 
    798 	sub	r12, r12, #1                            @ only one (possibly partial) block left: step rev_ctr32 back a third time
    799 	b	.L128_enc_blocks_less_than_1
    800 .L128_enc_blocks_more_than_3:@ blocks left >  3: store final-3 ciphertext, hash it, encrypt final-2
    801 	st1	{ q5}, [r2], #16                     @ AES final-3 block  - store result
    802 @ This path writes v11/v9/v10 directly with pmull/pmull2 (no eor), so it initialises the low/high/mid GHASH accumulators.
    803 	ldp	r6, r7, [r0], #16           @ AES final-2 block - load input low & high
    804 #ifdef __ARMEB__
    805 	rev	r6, r6
    806 	rev	r7, r7
    807 #endif
    808 	rev64	q4, q5                                    @ GHASH final-3 block
    809 
    810 	eor	q4, q4, q8                           @ feed in partial tag
    811 	eor	r7, r7, r14                     @ AES final-2 block - round 10 high
    812 	eor	r6, r6, r13                     @ AES final-2 block - round 10 low
    813 
    814 	fmov	d5, r6                                 @ AES final-2 block - mov low
    815 
    816 	movi	q8, #0                                        @ suppress further partial tag feed in
    817 	fmov	v5.d[1], r7                             @ AES final-2 block - mov high
    818 
    819 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
    820 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
    821 
    822 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
    823 
    824 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid (multiplier taken from the upper half of v17)
    825 
    826 	eor	q5, q5, q1                            @ AES final-2 block - result
    827 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
    828 
    829 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
    830 .L128_enc_blocks_more_than_2:@ blocks left >  2
    831 @ Reached by branch from the tail dispatch or by fall-through: store final-2 ciphertext, hash it with v14/v17 into v11/v9/v10, encrypt final-1.
    832 	st1	{ q5}, [r2], #16                     @ AES final-2 block - store result
    833 
    834 	rev64	q4, q5                                    @ GHASH final-2 block
    835 	ldp	r6, r7, [r0], #16           @ AES final-1 block - load input low & high
    836 #ifdef __ARMEB__
    837 	rev	r6, r6
    838 	rev	r7, r7
    839 #endif
    840 	eor	q4, q4, q8                           @ feed in partial tag
    841 
    842 	eor	r6, r6, r13                     @ AES final-1 block - round 10 low
    843 
    844 	fmov	d5, r6                                 @ AES final-1 block - mov low
    845 	eor	r7, r7, r14                     @ AES final-1 block - round 10 high
    846 
    847 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
    848 	fmov	v5.d[1], r7                             @ AES final-1 block - mov high
    849 
    850 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
    851 
    852 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
    853 
    854 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
    855 
    856 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
    857 
    858 	eor	q5, q5, q2                            @ AES final-1 block - result
    859 
    860 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
    861 
    862 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
    863 
    864 	movi	q8, #0                                        @ suppress further partial tag feed in
    865 
    866 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
    867 .L128_enc_blocks_more_than_1:@ blocks left >  1
    868 @ Reached by branch from the tail dispatch or by fall-through: store final-1 ciphertext, hash it with v13/v16 into v11/v9/v10, encrypt the final block.
    869 	st1	{ q5}, [r2], #16                     @ AES final-1 block - store result
    870 
    871 	rev64	q4, q5                                    @ GHASH final-1 block
    872 	ldp	r6, r7, [r0], #16           @ AES final block - load input low & high
    873 #ifdef __ARMEB__
    874 	rev	r6, r6
    875 	rev	r7, r7
    876 #endif
    877 	eor	q4, q4, q8                           @ feed in partial tag
    878 
    879 	eor	r7, r7, r14                     @ AES final block - round 10 high
    880 	eor	r6, r6, r13                     @ AES final block - round 10 low
    881 
    882 	fmov	d5, r6                                 @ AES final block - mov low
    883 
    884 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
    885 	fmov	v5.d[1], r7                             @ AES final block - mov high
    886 
    887 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
    888 
    889 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
    890 
    891 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
    892 
    893 	eor	q5, q5, q3                            @ AES final block - result
    894 
    895 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid (copy to upper half so pmull2 can pair it with the upper half of v16)
    896 
    897 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
    898 
    899 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
    900 
    901 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
    902 
    903 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
    904 	movi	q8, #0                                        @ suppress further partial tag feed in
    905 .L128_enc_blocks_less_than_1:@ blocks left <= 1
    906 @ Last block: build a 128-bit mask in r6:r7 from the bit length in r1, mask the block, hash it, reduce the GHASH accumulators, then store output, counter and tag.
    907 	and	r1, r1, #127                    @ bit_length %= 128
    908 	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff
    909 
    910 	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff
    911 	sub	r1, r1, #128                    @ bit_length -= 128
    912 
    913 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
    914 
    915 	and	r1, r1, #127                    @ bit_length %= 128
    916 @ r1 is now the number of unused bits in the last block (0 when the block is full).
    917 	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
    918 	cmp	r1, #64
    919 @ NOTE(review): for r1 >= 64 the low mask below is the shifted r14, which presumably relies on the shift amount being taken modulo 64 - confirm.
    920 	csel	r6, r13, r14, lt                       @ low mask: all ones if r1 < 64, else the shifted value
    921 	csel	r7, r14, xzr, lt                       @ high mask: the shifted value if r1 < 64, else zero
    922 
    923 	fmov	d0, r6                                 @ ctr0b is mask for last block
    924 
    925 	fmov	v0.d[1], r7
    926 
    927 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
    928 
    929 	rev64	q4, q5                                    @ GHASH final block
    930 
    931 	eor	q4, q4, q8                           @ feed in partial tag
    932 
    933 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
    934 
    935 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
    936 	ld1	{ v18.16b}, [r2]                            @ load existing bytes where the possibly partial last block is to be stored
    937 
    938 	eor	q8, q8, q4                          @ GHASH final block - mid
    939 #ifndef __ARMEB__
    940 	rev	r9, r12
    941 #else
    942 	mov	r9, r12
    943 #endif
    944 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
    945 
    946 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
    947 
    948 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
    949 
    950 	eor	q9, q9, v20.16b                            @ GHASH final block - high
    951 
    952 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
    953 	movi	q8, #0xc2                               @ byte constant that the shl below turns into mod_constant
    954 
    955 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
    956 
    957 	shl	d8, d8, #56               @ mod_constant
    958 
    959 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
    960 
    961 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
    962 
    963 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
    964 
    965 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
    966 
    967 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
    968 
    969 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
    970 
    971 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
    972 
    973 	bif	q5, v18.16b, q0                              @ insert existing bytes in top end of result before storing
    974 
    975 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
    976 	st1	{ q5}, [r2]                          @ store all 16B
    977 
    978 	str	r9, [r16, #12]                          @ store the updated counter
    979 
    980 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
    981 	ext	v11.16b, v11.16b, v11.16b, #8                     @ undo the ext/rev64 applied when the tag was loaded from [r3] at entry
    982 	rev64	v11.16b, v11.16b
    983 	mov	r0, r15                                 @ return value: byte_len saved in r15 at entry
    984 	st1	{ v11.16b }, [r3]                       @ write the updated tag back to [r3]
    985 	ldp	r21, r22, [sp, #16]                     @ epilogue: restore the registers saved by the prologue
    986 	ldp	r23, r24, [sp, #32]
    987 	ldp	d8, d9, [sp, #48]
    988 	ldp	d10, d11, [sp, #64]
    989 	ldp	d12, d13, [sp, #80]
    990 	ldp	d14, d15, [sp, #96]
    991 	ldp	r19, r20, [sp], #112                    @ restore r19/r20 and release the 112-byte frame
    992 	RET
    993 
    994 .L128_enc_ret:                                  @ early exit: target of the entry cbz taken when r1 (bit length) is zero, before any frame is pushed
    995 	mov	r0, #0x0                                @ return 0
    996 	RET
    997 .size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
    998 .globl	aes_gcm_dec_128_kernel
    999 .type	aes_gcm_dec_128_kernel,%function
   1000 .align	4
   1001 aes_gcm_dec_128_kernel:                         @ entry/setup: r0 = input, r1 = bit length, r2 = output, r3 = tag and h values, r4 = counter block, r5 = round keys
   1002 	cbz	r1, .L128_dec_ret                       @ zero bit length: leave before pushing a frame
   1003 	stp	r19, r20, [sp, #-112]!                  @ allocate a 112-byte frame and save r19/r20
   1004 	mov	r16, r4                                 @ r16 = pointer the counter block is loaded from below
   1005 	mov	r8, r5                                  @ r8 = pointer the round keys rk0..rk10 are loaded through below
   1006 	stp	r21, r22, [sp, #16]
   1007 	stp	r23, r24, [sp, #32]
   1008 	stp	d8, d9, [sp, #48]
   1009 	stp	d10, d11, [sp, #64]
   1010 	stp	d12, d13, [sp, #80]
   1011 	stp	d14, d15, [sp, #96]
   1012 @ Setup: build counter blocks 0-3, run AES rounds 0-9 on them while loading round keys and h values, then decrypt the first blocks if more than 4 are pending.
   1013 	lsr	r5, r1, #3              @ byte_len
   1014 	mov	r15, r5                                 @ keep byte_len in r15
   1015 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   1016 #ifdef __ARMEB__
   1017 	rev	r10, r10
   1018 	rev	r11, r11
   1019 #endif
   1020 	ldp	r13, r14, [r8, #160]                     @ load rk10
   1021 #ifdef __ARMEB__
   1022 	ror	r14, r14, 32
   1023 	ror	r13, r13, 32
   1024 #endif
   1025 	sub	r5, r5, #1      @ byte_len - 1
   1026 	ld1	{v18.4s}, [r8], #16                                @ load rk0
   1027 
   1028 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   1029 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   1030 
   1031 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   1032 #ifndef __ARMEB__
   1033 	ext	v13.16b, v13.16b, v13.16b, #8
   1034 #endif
   1035 	lsr	r12, r11, #32
   1036 	fmov	d2, r10                               @ CTR block 2
   1037 
   1038 	ld1	{v19.4s}, [r8], #16                                @ load rk1
   1039 	orr	r11, r11, r11
   1040 	rev	r12, r12                                @ rev_ctr32
   1041 
   1042 	fmov	d1, r10                               @ CTR block 1
   1043 	add	r12, r12, #1                            @ increment rev_ctr32
   1044 
   1045 	aese	q0, v18.16b
   1046 	aesmc	q0, q0          @ AES block 0 - round 0
   1047 	rev	r9, r12                                 @ CTR block 1
   1048 
   1049 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   1050 	ld1	{v20.4s}, [r8], #16                                @ load rk2
   1051 	add	r12, r12, #1                            @ CTR block 1
   1052 
   1053 	fmov	v1.d[1], r9                               @ CTR block 1
   1054 	rev	r9, r12                                 @ CTR block 2
   1055 	add	r12, r12, #1                            @ CTR block 2
   1056 
   1057 	aese	q0, v19.16b
   1058 	aesmc	q0, q0          @ AES block 0 - round 1
   1059 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   1060 
   1061 	fmov	v2.d[1], r9                               @ CTR block 2
   1062 	rev	r9, r12                                 @ CTR block 3
   1063 
   1064 	fmov	d3, r10                               @ CTR block 3
   1065 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   1066 	add	r12, r12, #1                            @ CTR block 3
   1067 
   1068 	fmov	v3.d[1], r9                               @ CTR block 3
   1069 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   1070 
   1071 	aese	q1, v18.16b
   1072 	aesmc	q1, q1          @ AES block 1 - round 0
   1073 	ld1	{v21.4s}, [r8], #16                                @ load rk3
   1074 
   1075 	aese	q0, v20.16b
   1076 	aesmc	q0, q0          @ AES block 0 - round 2
   1077 	ld1	{v22.4s}, [r8], #16                                @ load rk4
   1078 
   1079 	aese	q2, v18.16b
   1080 	aesmc	q2, q2          @ AES block 2 - round 0
   1081 	ld1	{v23.4s}, [r8], #16                                @ load rk5
   1082 
   1083 	aese	q1, v19.16b
   1084 	aesmc	q1, q1          @ AES block 1 - round 1
   1085 	ld1	{v24.4s}, [r8], #16                                @ load rk6
   1086 
   1087 	aese	q3, v18.16b
   1088 	aesmc	q3, q3          @ AES block 3 - round 0
   1089 
   1090 	aese	q2, v19.16b
   1091 	aesmc	q2, q2          @ AES block 2 - round 1
   1092 
   1093 	aese	q1, v20.16b
   1094 	aesmc	q1, q1          @ AES block 1 - round 2
   1095 
   1096 	aese	q3, v19.16b
   1097 	aesmc	q3, q3          @ AES block 3 - round 1
   1098 	ld1	{ v11.16b}, [r3]                        @ load the current tag from [r3]; the next two instructions swap its halves and byte-reverse each half
   1099 	ext	v11.16b, v11.16b, v11.16b, #8
   1100 	rev64	v11.16b, v11.16b
   1101 
   1102 	aese	q0, v21.16b
   1103 	aesmc	q0, q0          @ AES block 0 - round 3
   1104 	ld1	{v25.4s}, [r8], #16                                @ load rk7
   1105 
   1106 	aese	q1, v21.16b
   1107 	aesmc	q1, q1          @ AES block 1 - round 3
   1108 
   1109 	aese	q3, v20.16b
   1110 	aesmc	q3, q3          @ AES block 3 - round 2
   1111 
   1112 	aese	q2, v20.16b
   1113 	aesmc	q2, q2          @ AES block 2 - round 2
   1114 	ld1	{v26.4s}, [r8], #16                                @ load rk8
   1115 
   1116 	aese	q1, v22.16b
   1117 	aesmc	q1, q1          @ AES block 1 - round 4
   1118 
   1119 	aese	q3, v21.16b
   1120 	aesmc	q3, q3          @ AES block 3 - round 3
   1121 
   1122 	aese	q2, v21.16b
   1123 	aesmc	q2, q2          @ AES block 2 - round 3
   1124 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   1125 #ifndef __ARMEB__
   1126 	ext	v14.16b, v14.16b, v14.16b, #8
   1127 #endif
   1128 	aese	q0, v22.16b
   1129 	aesmc	q0, q0          @ AES block 0 - round 4
   1130 	ld1	{v27.4s}, [r8], #16                                @ load rk9
   1131 
   1132 	aese	q1, v23.16b
   1133 	aesmc	q1, q1          @ AES block 1 - round 5
   1134 
   1135 	aese	q2, v22.16b
   1136 	aesmc	q2, q2          @ AES block 2 - round 4
   1137 
   1138 	aese	q3, v22.16b
   1139 	aesmc	q3, q3          @ AES block 3 - round 4
   1140 
   1141 	aese	q0, v23.16b
   1142 	aesmc	q0, q0          @ AES block 0 - round 5
   1143 
   1144 	aese	q2, v23.16b
   1145 	aesmc	q2, q2          @ AES block 2 - round 5
   1146 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   1147 #ifndef __ARMEB__
   1148 	ext	v12.16b, v12.16b, v12.16b, #8
   1149 #endif
   1150 	aese	q3, v23.16b
   1151 	aesmc	q3, q3          @ AES block 3 - round 5
   1152 
   1153 	aese	q0, v24.16b
   1154 	aesmc	q0, q0          @ AES block 0 - round 6
   1155 
   1156 	aese	q1, v24.16b
   1157 	aesmc	q1, q1          @ AES block 1 - round 6
   1158 
   1159 	aese	q3, v24.16b
   1160 	aesmc	q3, q3          @ AES block 3 - round 6
   1161 
   1162 	aese	q2, v24.16b
   1163 	aesmc	q2, q2          @ AES block 2 - round 6
   1164 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   1165 
   1166 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   1167 #ifndef __ARMEB__
   1168 	ext	v15.16b, v15.16b, v15.16b, #8
   1169 #endif
   1170 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   1171 	add	r5, r5, r0                              @ r5 = input address at which the main loop stops (compared against r0 below)
   1172 
   1173 	aese	q1, v25.16b
   1174 	aesmc	q1, q1          @ AES block 1 - round 7
   1175 
   1176 	aese	q2, v25.16b
   1177 	aesmc	q2, q2          @ AES block 2 - round 7
   1178 
   1179 	aese	q0, v25.16b
   1180 	aesmc	q0, q0          @ AES block 0 - round 7
   1181 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   1182 
   1183 	aese	q3, v25.16b
   1184 	aesmc	q3, q3          @ AES block 3 - round 7
   1185 
   1186 	aese	q1, v26.16b
   1187 	aesmc	q1, q1          @ AES block 1 - round 8
   1188 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   1189 
   1190 	aese	q2, v26.16b
   1191 	aesmc	q2, q2          @ AES block 2 - round 8
   1192 
   1193 	aese	q3, v26.16b
   1194 	aesmc	q3, q3          @ AES block 3 - round 8
   1195 
   1196 	aese	q0, v26.16b
   1197 	aesmc	q0, q0          @ AES block 0 - round 8
   1198 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   1199 
   1200 	aese	q2, v27.16b                                      @ AES block 2 - round 9
   1201 
   1202 	aese	q3, v27.16b                                      @ AES block 3 - round 9
   1203 
   1204 	aese	q0, v27.16b                                      @ AES block 0 - round 9
   1205 	cmp	r0, r5                   @ check if we have <= 4 blocks
   1206 
   1207 	aese	q1, v27.16b                                      @ AES block 1 - round 9
   1208 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   1209 	bge	.L128_dec_tail                                    @ handle tail
   1210 @ More than 4 blocks: load ciphertext blocks 0-3, finish blocks 0 and 1 (round 10 is the scalar XOR with r13/r14) and prepare counter blocks 4-6.
   1211 	ld1	{q4, q5}, [r0], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
   1212 
   1213 	eor	q1, q5, q1                            @ AES block 1 - result
   1214 	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
   1215 
   1216 	eor	q0, q4, q0                            @ AES block 0 - result
   1217 	rev64	q4, q4                                    @ GHASH block 0
   1218 	rev	r9, r12                                 @ CTR block 4
   1219 
   1220 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   1221 	add	r12, r12, #1                            @ CTR block 4
   1222 	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
   1223 
   1224 	rev64	q5, q5                                    @ GHASH block 1
   1225 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
   1226 
   1227 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
   1228 
   1229 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
   1230 	cmp	r0, r5                   @ check if we have <= 8 blocks
   1231 
   1232 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
   1233 
   1234 	fmov	d0, r10                               @ CTR block 4
   1235 
   1236 	fmov	v0.d[1], r9                               @ CTR block 4
   1237 	rev	r9, r12                                 @ CTR block 5
   1238 	eor	r19, r19, r13                   @ AES block 1 - round 10 low
   1239 #ifdef __ARMEB__
   1240 	rev	r19, r19
   1241 #endif
   1242 	fmov	d1, r10                               @ CTR block 5
   1243 	add	r12, r12, #1                            @ CTR block 5
   1244 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   1245 
   1246 	fmov	v1.d[1], r9                               @ CTR block 5
   1247 	rev	r9, r12                                 @ CTR block 6
   1248 	add	r12, r12, #1                            @ CTR block 6
   1249 
   1250 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   1251 
   1252 	eor	r20, r20, r14                   @ AES block 1 - round 10 high
   1253 #ifdef __ARMEB__
   1254 	rev	r20, r20
   1255 #endif
   1256 	eor	r6, r6, r13                   @ AES block 0 - round 10 low
   1257 #ifdef __ARMEB__
   1258 	rev	r6, r6
   1259 #endif
   1260 	eor	q2, q6, q2                            @ AES block 2 - result
   1261 
   1262 	eor	r7, r7, r14                   @ AES block 0 - round 10 high
   1263 #ifdef __ARMEB__
   1264 	rev	r7, r7
   1265 #endif
   1266 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
   1267 
   1268 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
   1269 	bge	.L128_dec_prepretail                              @ do prepretail
   1270 
   1271 .L128_dec_main_loop:@ main loop start: each pass stores 4 plaintext blocks, hashes ciphertext in q4-q7 and runs AES on the next 4 counter blocks
   1272 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   1273 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   1274 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   1275 
   1276 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   1277 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   1278 
   1279 	aese	q1, v18.16b
   1280 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   1281 	fmov	d2, r10                               @ CTR block 4k+6
   1282 
   1283 	rev64	q6, q6                                    @ GHASH block 4k+2
   1284 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   1285 	rev	r9, r12                                 @ CTR block 4k+7
   1286 
   1287 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   1288 	eor	q4, q4, v11.16b                           @ PRE 1
   1289 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
   1290 
   1291 	aese	q1, v19.16b
   1292 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   1293 	rev64	q7, q7                                    @ GHASH block 4k+3
   1294 
   1295 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   1296 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   1297 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   1298 
   1299 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   1300 	fmov	d3, r10                               @ CTR block 4k+7
   1301 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
   1302 
   1303 	aese	q1, v20.16b
   1304 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   1305 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   1306 
   1307 	aese	q2, v18.16b
   1308 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   1309 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   1310 
   1311 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   1312 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
   1313 
   1314 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   1315 
   1316 	aese	q1, v21.16b
   1317 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   1318 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   1319 
   1320 	aese	q3, v18.16b
   1321 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   1322 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
   1323 
   1324 	aese	q0, v18.16b
   1325 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   1326 
   1327 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   1328 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   1329 
   1330 	aese	q3, v19.16b
   1331 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   1332 	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
   1333 #ifdef __ARMEB__
   1334 	rev	r23, r23
   1335 #endif
   1336 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
   1337 	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
   1338 #ifdef __ARMEB__
   1339 	rev	r22, r22
   1340 #endif
   1341 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   1342 
   1343 	aese	q0, v19.16b
   1344 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   1345 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
   1346 
   1347 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   1348 
   1349 	aese	q3, v20.16b
   1350 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   1351 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   1352 
   1353 	aese	q0, v20.16b
   1354 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   1355 
   1356 	aese	q1, v22.16b
   1357 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   1358 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
   1359 
   1360 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   1361 
   1362 	aese	q0, v21.16b
   1363 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   1364 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   1365 
   1366 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   1367 
   1368 	aese	q2, v19.16b
   1369 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   1370 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   1371 
   1372 	aese	q0, v22.16b
   1373 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   1374 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
   1375 
   1376 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   1377 	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
   1378 #ifdef __ARMEB__
   1379 	rev	r24, r24
   1380 #endif
   1381 	aese	q2, v20.16b
   1382 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   1383 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   1384 
   1385 	aese	q1, v23.16b
   1386 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   1387 	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
   1388 #ifdef __ARMEB__
   1389 	rev	r21, r21
   1390 #endif
   1391 	aese	q0, v23.16b
   1392 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   1393 	movi	q8, #0xc2                               @ byte constant that the shl below turns into mod_constant
   1394 
   1395 	aese	q2, v21.16b
   1396 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   1397 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
   1398 
   1399 	aese	q1, v24.16b
   1400 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   1401 
   1402 	aese	q0, v24.16b
   1403 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   1404 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   1405 
   1406 	aese	q2, v22.16b
   1407 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   1408 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   1409 
   1410 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   1411 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
   1412 	ld1	{q4}, [r0], #16                       @ AES block 4k+3 - load ciphertext
   1413 
   1414 	aese	q1, v25.16b
   1415 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   1416 	add	r12, r12, #1                            @ CTR block 4k+7
   1417 
   1418 	aese	q0, v25.16b
   1419 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   1420 	shl	d8, d8, #56               @ mod_constant
   1421 
   1422 	aese	q2, v23.16b
   1423 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   1424 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   1425 
   1426 	aese	q1, v26.16b
   1427 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   1428 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   1429 
   1430 	aese	q0, v26.16b
   1431 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   1432 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   1433 
   1434 	aese	q3, v21.16b
   1435 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   1436 	rev	r9, r12                                 @ CTR block 4k+8
   1437 
   1438 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   1439 	ld1	{q5}, [r0], #16                       @ AES block 4k+4 - load ciphertext
   1440 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   1441 
   1442 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
   1443 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   1444 
   1445 	aese	q3, v22.16b
   1446 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   1447 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   1448 
   1449 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
   1450 
   1451 	aese	q2, v24.16b
   1452 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   1453 	eor	q0, q4, q0                            @ AES block 4k+4 - result
   1454 
   1455 	aese	q3, v23.16b
   1456 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   1457 	ld1	{q6}, [r0], #16                       @ AES block 4k+5 - load ciphertext
   1458 
   1459 	add	r12, r12, #1                            @ CTR block 4k+8
   1460 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   1461 	eor	q1, q5, q1                            @ AES block 4k+5 - result
   1462 
   1463 	aese	q2, v25.16b
   1464 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   1465 	ld1	{q7}, [r0], #16                       @ AES block 4k+6 - load ciphertext
   1466 
   1467 	aese	q3, v24.16b
   1468 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   1469 
   1470 	rev64	q5, q5                                    @ GHASH block 4k+5
   1471 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   1472 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   1473 
   1474 	aese	q2, v26.16b
   1475 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   1476 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   1477 
   1478 	aese	q3, v25.16b
   1479 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   1480 	fmov	d0, r10                               @ CTR block 4k+8
   1481 
   1482 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   1483 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   1484 	rev	r9, r12                                 @ CTR block 4k+9
   1485 
   1486 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
   1487 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   1488 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   1489 
   1490 	aese	q3, v26.16b
   1491 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   1492 	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
   1493 #ifdef __ARMEB__
   1494 	rev	r7, r7
   1495 #endif
   1496 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   1497 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
   1498 	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
   1499 #ifdef __ARMEB__
   1500 	rev	r6, r6
   1501 #endif
   1502 	eor	q2, q6, q2                            @ AES block 4k+6 - result
   1503 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
   1504 	add	r12, r12, #1                            @ CTR block 4k+9
   1505 
   1506 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
   1507 	fmov	d1, r10                               @ CTR block 4k+9
   1508 	cmp	r0, r5                   @ .LOOP CONTROL
   1509 
   1510 	rev64	q4, q4                                    @ GHASH block 4k+4
   1511 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   1512 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   1513 
   1514 	rev	r9, r12                                 @ CTR block 4k+10
   1515 	add	r12, r12, #1                            @ CTR block 4k+10
   1516 
   1517 	eor	r20, r20, r14                   @ AES block 4k+5 - round 10 high
   1518 #ifdef __ARMEB__
   1519 	rev	r20, r20
   1520 #endif
   1521 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
   1522 
   1523 	eor	r19, r19, r13                   @ AES block 4k+5 - round 10 low
   1524 #ifdef __ARMEB__
   1525 	rev	r19, r19
   1526 #endif
   1527 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
   1528 
   1529 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   1530 	blt	.L128_dec_main_loop                     @ repeat while the input pointer r0 is below the main-loop end r5 (flags from the cmp above)
   1531 
   1532 .L128_dec_prepretail:@ PREPRETAIL
   1533 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   1534 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   1535 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
   1536 
   1537 	aese	q0, v18.16b
   1538 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   1539 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   1540 
   1541 	aese	q1, v18.16b
   1542 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   1543 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   1544 
   1545 	eor	q4, q4, v11.16b                           @ PRE 1
   1546 	fmov	d2, r10                               @ CTR block 4k+6
   1547 	rev64	q6, q6                                    @ GHASH block 4k+2
   1548 
   1549 	aese	q0, v19.16b
   1550 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   1551 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   1552 
   1553 	rev	r9, r12                                 @ CTR block 4k+7
   1554 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   1555 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
   1556 
   1557 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   1558 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   1559 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   1560 
   1561 	aese	q1, v19.16b
   1562 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   1563 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   1564 
   1565 	aese	q0, v20.16b
   1566 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   1567 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   1568 
   1569 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   1570 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   1571 	fmov	d3, r10                               @ CTR block 4k+7
   1572 
   1573 	aese	q2, v18.16b
   1574 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   1575 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   1576 
   1577 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
   1578 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   1579 
   1580 	rev64	q7, q7                                    @ GHASH block 4k+3
   1581 
   1582 	aese	q2, v19.16b
   1583 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   1584 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   1585 
   1586 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   1587 
   1588 	aese	q3, v18.16b
   1589 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   1590 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   1591 
   1592 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   1593 
   1594 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   1595 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
   1596 
   1597 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   1598 
   1599 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   1600 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
   1601 
   1602 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
   1603 
   1604 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   1605 
   1606 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   1607 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   1608 
   1609 	aese	q1, v20.16b
   1610 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   1611 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   1612 
   1613 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   1614 
   1615 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
   1616 	movi	q8, #0xc2
   1617 
   1618 	aese	q3, v19.16b
   1619 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   1620 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   1621 
   1622 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
   1623 
   1624 	aese	q2, v20.16b
   1625 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   1626 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
   1627 
   1628 	aese	q3, v20.16b
   1629 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   1630 	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
   1631 #ifdef __ARMEB__
   1632 	rev	r23, r23
   1633 #endif
   1634 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   1635 	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
   1636 #ifdef __ARMEB__
   1637 	rev	r21, r21
   1638 #endif
   1639 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
   1640 
   1641 	aese	q2, v21.16b
   1642 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   1643 
   1644 	aese	q1, v21.16b
   1645 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   1646 	shl	d8, d8, #56               @ mod_constant
   1647 
   1648 	aese	q0, v21.16b
   1649 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   1650 
   1651 	aese	q2, v22.16b
   1652 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   1653 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   1654 
   1655 	aese	q1, v22.16b
   1656 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   1657 
   1658 	aese	q3, v21.16b
   1659 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   1660 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   1661 
   1662 	aese	q2, v23.16b
   1663 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   1664 
   1665 	aese	q1, v23.16b
   1666 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   1667 
   1668 	aese	q3, v22.16b
   1669 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   1670 
   1671 	aese	q0, v22.16b
   1672 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   1673 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   1674 
   1675 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   1676 
   1677 	aese	q1, v24.16b
   1678 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   1679 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   1680 
   1681 	aese	q3, v23.16b
   1682 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   1683 
   1684 	aese	q0, v23.16b
   1685 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   1686 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   1687 
   1688 	aese	q1, v25.16b
   1689 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   1690 
   1691 	aese	q2, v24.16b
   1692 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   1693 
   1694 	aese	q0, v24.16b
   1695 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   1696 
   1697 	aese	q1, v26.16b
   1698 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   1699 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   1700 
   1701 	aese	q3, v24.16b
   1702 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   1703 
   1704 	aese	q0, v25.16b
   1705 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   1706 
   1707 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
   1708 
   1709 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   1710 	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
   1711 #ifdef __ARMEB__
   1712 	rev	r24, r24
   1713 #endif
   1714 	aese	q2, v25.16b
   1715 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   1716 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   1717 
   1718 	aese	q3, v25.16b
   1719 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   1720 
   1721 	aese	q0, v26.16b
   1722 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   1723 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   1724 
   1725 	aese	q2, v26.16b
   1726 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   1727 
   1728 	aese	q3, v26.16b
   1729 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   1730 	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
   1731 #ifdef __ARMEB__
   1732 	rev	r22, r22
   1733 #endif
   1734 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
   1735 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   1736 
   1737 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
   1738 	add	r12, r12, #1                            @ CTR block 4k+7
   1739 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   1740 
   1741 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
   1742 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   1743 .L128_dec_tail:@ TAIL - handle the last blocks; dispatches on the byte count computed into r5
   1744 @ Prepares the first remaining block in r6:r7, then branches on r5 > 48 / > 32 / > 16.
   1745 	sub	r5, r4, r0   @ r5 = r4 - r0: the main_end_input_ptr register now holds the number of bytes left to process
   1746 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   1747 
   1748 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   1749 
   1750 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   1751 
   1752 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   1753 
   1754 	cmp	r5, #48                         @ more than 48 bytes left? consumed by the bgt below (nothing in between sets flags)
   1755 
   1756 	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
   1757 #ifdef __ARMEB__
   1758 	rev	r7, r7
   1759 #endif
   1760 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   1761 	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
   1762 #ifdef __ARMEB__
   1763 	rev	r6, r6
   1764 #endif
   1765 	bgt	.L128_dec_blocks_more_than_3
   1766 @ <= 48 bytes left: the more_than_3 section is skipped, so set up what it would have provided.
   1767 	mov	q3, q2                          @ slide pending AES state down: more_than_1 reads q3
   1768 	sub	r12, r12, #1                    @ step the counter in r12 back by one (one of the prepared blocks is unused)
   1769 	movi	v11.8b, #0                      @ zero GHASH low accumulator (running tag was copied to q8 above)
   1770 
   1771 	movi	q9, #0                          @ zero GHASH high accumulator
   1772 	mov	q2, q1                          @ slide pending AES state down: more_than_2 reads q2
   1773 
   1774 	movi	v10.8b, #0                      @ zero GHASH mid accumulator
   1775 	cmp	r5, #32                         @ more than 32 bytes left?
   1776 	bgt	.L128_dec_blocks_more_than_2
   1777 
   1778 	cmp	r5, #16                         @ more than 16 bytes left? consumed by the bgt below
   1779 
   1780 	mov	q3, q1                          @ <= 32 bytes: more_than_1 takes its AES state from q3
   1781 	sub	r12, r12, #1                    @ step the counter back again
   1782 	bgt	.L128_dec_blocks_more_than_1
   1783 
   1784 	sub	r12, r12, #1                    @ <= 16 bytes: only the block already held in r6:r7 is used
   1785 	b	.L128_dec_blocks_less_than_1
   1786 .L128_dec_blocks_more_than_3:@ blocks left >  3 - store final-3 result, start GHASH sums, prepare final-2 block in r6:r7
   1787 	rev64	q4, q5                                    @ GHASH final-3 block (reads q5 before the load below overwrites it)
   1788 	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext
   1789 @ v11 (low), v9 (high) and v10 (mid) are written, not accumulated, by the pmull/pmull2 in this section.
   1790 	eor	q4, q4, q8                           @ feed in partial tag
   1791 
   1792 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
   1793 	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result
   1794 	eor	q0, q5, q1                            @ AES final-2 block - result
   1795 
   1796 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
   1797 	mov	r7, v0.d[1]                            @ AES final-2 block - mov high
   1798 
   1799 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
   1800 	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
   1801 
   1802 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
   1803 
   1804 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
   1805 
   1806 	movi	q8, #0                                        @ suppress further partial tag feed in
   1807 	eor	r7, r7, r14                   @ AES final-2 block - round 10 high
   1808 #ifdef __ARMEB__
   1809 	rev	r7, r7
   1810 #endif
   1811 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
   1812 	eor	r6, r6, r13                   @ AES final-2 block - round 10 low
   1813 #ifdef __ARMEB__
   1814 	rev	r6, r6
   1815 #endif
   1816 .L128_dec_blocks_more_than_2:@ blocks left >  2 - store final-2 result, accumulate its GHASH terms, prepare final-1 block in r6:r7
   1817 @ The products computed here (v21 low, v20 high, v22 mid) are XOR-accumulated into v11 / q9 / v10.
   1818 	rev64	q4, q5                                    @ GHASH final-2 block (reads q5 before the load below overwrites it)
   1819 	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext
   1820 
   1821 	eor	q4, q4, q8                           @ feed in partial tag
   1822 
   1823 	eor	q0, q5, q2                            @ AES final-1 block - result
   1824 	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result
   1825 
   1826 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
   1827 
   1828 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
   1829 
   1830 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
   1831 	mov	r6, v0.d[0]                            @ AES final-1 block - mov low
   1832 
   1833 	mov	r7, v0.d[1]                            @ AES final-1 block - mov high
   1834 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
   1835 
   1836 	movi	q8, #0                                        @ suppress further partial tag feed in
   1837 
   1838 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
   1839 
   1840 	eor	r6, r6, r13                   @ AES final-1 block - round 10 low
   1841 #ifdef __ARMEB__
   1842 	rev	r6, r6
   1843 #endif
   1844 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
   1845 
   1846 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
   1847 
   1848 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
   1849 	eor	r7, r7, r14                   @ AES final-1 block - round 10 high
   1850 #ifdef __ARMEB__
   1851 	rev	r7, r7
   1852 #endif
   1853 .L128_dec_blocks_more_than_1:@ blocks left >  1 - store final-1 result, accumulate its GHASH terms, prepare final block in r6:r7
   1854 @ The final block's result is left in r6:r7 for the masking code that follows; it is not stored here.
   1855 	rev64	q4, q5                                    @ GHASH final-1 block (reads q5 before the load below overwrites it)
   1856 
   1857 	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext
   1858 	eor	q4, q4, q8                           @ feed in partial tag
   1859 
   1860 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
   1861 
   1862 	eor	q0, q5, q3                            @ AES final block - result
   1863 
   1864 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
   1865 
   1866 	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result
   1867 	mov	r6, v0.d[0]                            @ AES final block - mov low
   1868 
   1869 	mov	r7, v0.d[1]                            @ AES final block - mov high
   1870 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid (copy to the upper lane, which pmull2 below multiplies)
   1871 
   1872 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
   1873 
   1874 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
   1875 
   1876 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
   1877 	movi	q8, #0                                        @ suppress further partial tag feed in
   1878 
   1879 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
   1880 
   1881 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
   1882 	eor	r7, r7, r14                   @ AES final block - round 10 high
   1883 #ifdef __ARMEB__
   1884 	rev	r7, r7
   1885 #endif
   1886 	eor	r6, r6, r13                   @ AES final block - round 10 low
   1887 #ifdef __ARMEB__
   1888 	rev	r6, r6
   1889 #endif
   1890 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
   1891 .L128_dec_blocks_less_than_1:@ blocks left <= 1 - mask the (possibly partial) last block, finish GHASH, store results, return
   1892 @ r13/r14 (used above as the round 10 key words) are reused here as all-ones sources for the mask.
   1893 	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff
   1894 	and	r1, r1, #127                    @ bit_length %= 128
   1895 
   1896 	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff
   1897 	sub	r1, r1, #128                    @ bit_length -= 128
   1898 
   1899 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
   1900 
   1901 	and	r1, r1, #127                    @ bit_length %= 128 (r1 = number of bits missing from the last block, 0 when it is full)
   1902 
   1903 	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
   1904 	cmp	r1, #64                         @ NOTE(review): for r1 >= 64 the shift above relies on only the low 6 bits of r1 being used - confirm
   1905 
   1906 	csel	r10, r14, xzr, lt               @ r10 = high-half mask: shifted ones if fewer than 64 bits are missing, else 0
   1907 	csel	r9, r13, r14, lt                @ r9 = low-half mask: all ones if fewer than 64 bits are missing, else shifted ones
   1908 
   1909 	fmov	d0, r9                                   @ ctr0b is mask for last block
   1910 
   1911 	mov	v0.d[1], r10
   1912 
   1913 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   1914 
   1915 	rev64	q4, q5                                    @ GHASH final block
   1916 
   1917 	eor	q4, q4, q8                           @ feed in partial tag
   1918 
   1919 	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
   1920 
   1921 	and	r7, r7, r10                     @ keep only the valid bytes of the result's high half
   1922 
   1923 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   1924 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   1925 
   1926 	eor	q8, q8, q4                          @ GHASH final block - mid
   1927 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   1928 
   1929 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   1930 
   1931 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   1932 	bic	r4, r4, r9           @ mask out low existing bytes
   1933 	and	r6, r6, r9                      @ keep only the valid bytes of the result's low half
   1934 
   1935 #ifndef __ARMEB__
   1936 	rev	r9, r12
   1937 #else
   1938 	mov	r9, r12
   1939 #endif
   1940 @ r9 (mask no longer needed) now carries the counter from r12, stored to [r16, #12] below.
   1941 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
   1942 	movi	q8, #0xc2
   1943 
   1944 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   1945 
   1946 	bic	r5, r5, r10   @ mask out high existing bytes
   1947 	shl	d8, d8, #56               @ mod_constant
   1948 
   1949 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   1950 
   1951 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   1952 
   1953 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   1954 
   1955 	orr	r6, r6, r4                      @ merge masked result with the preserved existing low bytes
   1956 	str	r9, [r16, #12]                          @ store the updated counter
   1957 
   1958 	orr	r7, r7, r5                      @ merge masked result with the preserved existing high bytes
   1959 	stp	r6, r7, [r2]                    @ store the last block (no post-increment of r2)
   1960 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   1961 
   1962 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   1963 
   1964 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   1965 
   1966 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   1967 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   1968 
   1969 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   1970 
   1971 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   1972 	ext	v11.16b, v11.16b, v11.16b, #8   @ swap the 64-bit halves of the reduced tag ...
   1973 	rev64	v11.16b, v11.16b                @ ... and byte-reverse each half before writing it out
   1974 	mov	r0, r15                         @ return value; presumably the byte length saved at entry (as the enc kernels do) - confirm
   1975 	st1	{ v11.16b }, [r3]               @ write the updated tag to [r3]
   1976 @ Epilogue: reload the saved registers from the 112-byte frame and release it.
   1977 	ldp	r21, r22, [sp, #16]
   1978 	ldp	r23, r24, [sp, #32]
   1979 	ldp	d8, d9, [sp, #48]
   1980 	ldp	d10, d11, [sp, #64]
   1981 	ldp	d12, d13, [sp, #80]
   1982 	ldp	d14, d15, [sp, #96]
   1983 	ldp	r19, r20, [sp], #112            @ post-indexed: also pops the 112-byte frame
   1984 	RET
   1985 
   1986 .L128_dec_ret:@ return path with no register restore; presumably the zero-length early exit (enc kernels branch to their *_ret via cbz r1) - confirm
   1987 	mov	r0, #0x0                        @ return 0
   1988 	RET
   1989 .size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
   1990 .globl	aes_gcm_enc_192_kernel
   1991 .type	aes_gcm_enc_192_kernel,%function
   1992 .align	4
   1993 aes_gcm_enc_192_kernel:
   1994 	cbz	r1, .L192_enc_ret
   1995 	stp	r19, r20, [sp, #-112]!
   1996 	mov	r16, r4
   1997 	mov	r8, r5
   1998 	stp	r21, r22, [sp, #16]
   1999 	stp	r23, r24, [sp, #32]
   2000 	stp	d8, d9, [sp, #48]
   2001 	stp	d10, d11, [sp, #64]
   2002 	stp	d12, d13, [sp, #80]
   2003 	stp	d14, d15, [sp, #96]
   2004 
   2005 	ldp	r10, r11, [r16]             @ ctr96_b64, ctr96_t32
   2006 #ifdef __ARMEB__
   2007 	rev	r10, r10
   2008 	rev	r11, r11
   2009 #endif
   2010 	ldp	r13, r14, [r8, #192]                     @ load rk12
   2011 #ifdef __ARMEB__
   2012 	ror	r13, r13, #32
   2013 	ror	r14, r14, #32
   2014 #endif
   2015 	ld1	{v18.4s}, [r8], #16	                             @ load rk0
   2016 
   2017 	ld1	{v19.4s}, [r8], #16	                             @ load rk1
   2018 
   2019 	ld1	{v20.4s}, [r8], #16	                             @ load rk2
   2020 
   2021 	lsr	r12, r11, #32
   2022 	ld1	{v21.4s}, [r8], #16	                             @ load rk3
   2023 	orr	r11, r11, r11
   2024 
   2025 	ld1	{v22.4s}, [r8], #16	                             @ load rk4
   2026 	rev	r12, r12                               @ rev_ctr32
   2027 
   2028 	add	r12, r12, #1                           @ increment rev_ctr32
   2029 	fmov	d3, r10                              @ CTR block 3
   2030 
   2031 	rev	r9, r12                                @ CTR block 1
   2032 	add	r12, r12, #1                           @ CTR block 1
   2033 	fmov	d1, r10                              @ CTR block 1
   2034 
   2035 	orr	r9, r11, r9, lsl #32           @ CTR block 1
   2036 	ld1	{ q0}, [r16]                            @ special case vector load initial counter so we can start first AES block as quickly as possible
   2037 
   2038 	fmov	v1.d[1], r9                              @ CTR block 1
   2039 	rev	r9, r12                                @ CTR block 2
   2040 	add	r12, r12, #1                           @ CTR block 2
   2041 
   2042 	fmov	d2, r10                              @ CTR block 2
   2043 	orr	r9, r11, r9, lsl #32           @ CTR block 2
   2044 
   2045 	fmov	v2.d[1], r9                              @ CTR block 2
   2046 	rev	r9, r12                                @ CTR block 3
   2047 
   2048 	orr	r9, r11, r9, lsl #32           @ CTR block 3
   2049 	ld1	{v23.4s}, [r8], #16	                             @ load rk5
   2050 
   2051 	fmov	v3.d[1], r9                              @ CTR block 3
   2052 
   2053 	ld1	{v24.4s}, [r8], #16	                             @ load rk6
   2054 
   2055 	ld1	{v25.4s}, [r8], #16	                             @ load rk7
   2056 
   2057 	aese	q0, v18.16b
   2058 	aesmc	q0, q0         @ AES block 0 - round 0
   2059 	ld1	{ v11.16b}, [r3]
   2060 	ext	v11.16b, v11.16b, v11.16b, #8
   2061 	rev64	v11.16b, v11.16b
   2062 
   2063 	aese	q3, v18.16b
   2064 	aesmc	q3, q3         @ AES block 3 - round 0
   2065 	ld1	{v26.4s}, [r8], #16	                             @ load rk8
   2066 
   2067 	aese	q1, v18.16b
   2068 	aesmc	q1, q1         @ AES block 1 - round 0
   2069 	ldr	q15, [r3, #112]                       @ load h4l | h4h
   2070 #ifndef __ARMEB__
   2071 	ext	v15.16b, v15.16b, v15.16b, #8
   2072 #endif
   2073 	aese	q2, v18.16b
   2074 	aesmc	q2, q2         @ AES block 2 - round 0
   2075 	ld1	{v27.4s}, [r8], #16	                             @ load rk9
   2076 
   2077 	aese	q0, v19.16b
   2078 	aesmc	q0, q0         @ AES block 0 - round 1
   2079 	ld1	{v28.4s}, [r8], #16	                         @ load rk10
   2080 
   2081 	aese	q1, v19.16b
   2082 	aesmc	q1, q1         @ AES block 1 - round 1
   2083 	ldr	q12, [r3, #32]                        @ load h1l | h1h
   2084 #ifndef __ARMEB__
   2085 	ext	v12.16b, v12.16b, v12.16b, #8
   2086 #endif
   2087 	aese	q2, v19.16b
   2088 	aesmc	q2, q2         @ AES block 2 - round 1
   2089 	ld1	{v29.4s}, [r8], #16	                         @ load rk11
   2090 
   2091 	aese	q3, v19.16b
   2092 	aesmc	q3, q3         @ AES block 3 - round 1
   2093 	ldr	q14, [r3, #80]                        @ load h3l | h3h
   2094 #ifndef __ARMEB__
   2095 	ext	v14.16b, v14.16b, v14.16b, #8
   2096 #endif
   2097 	aese	q0, v20.16b
   2098 	aesmc	q0, q0         @ AES block 0 - round 2
   2099 
   2100 	aese	q2, v20.16b
   2101 	aesmc	q2, q2         @ AES block 2 - round 2
   2102 
   2103 	aese	q3, v20.16b
   2104 	aesmc	q3, q3         @ AES block 3 - round 2
   2105 
   2106 	aese	q0, v21.16b
   2107 	aesmc	q0, q0         @ AES block 0 - round 3
   2108 	trn1	q9, v14.2d,    v15.2d                     @ h4h | h3h
   2109 
   2110 	aese	q2, v21.16b
   2111 	aesmc	q2, q2         @ AES block 2 - round 3
   2112 
   2113 	aese	q1, v20.16b
   2114 	aesmc	q1, q1         @ AES block 1 - round 2
   2115 	trn2	v17.2d,  v14.2d,    v15.2d                     @ h4l | h3l
   2116 
   2117 	aese	q0, v22.16b
   2118 	aesmc	q0, q0         @ AES block 0 - round 4
   2119 
   2120 	aese	q3, v21.16b
   2121 	aesmc	q3, q3         @ AES block 3 - round 3
   2122 
   2123 	aese	q1, v21.16b
   2124 	aesmc	q1, q1         @ AES block 1 - round 3
   2125 
   2126 	aese	q0, v23.16b
   2127 	aesmc	q0, q0         @ AES block 0 - round 5
   2128 
   2129 	aese	q2, v22.16b
   2130 	aesmc	q2, q2         @ AES block 2 - round 4
   2131 
   2132 	aese	q1, v22.16b
   2133 	aesmc	q1, q1         @ AES block 1 - round 4
   2134 
   2135 	aese	q0, v24.16b
   2136 	aesmc	q0, q0         @ AES block 0 - round 6
   2137 
   2138 	aese	q3, v22.16b
   2139 	aesmc	q3, q3         @ AES block 3 - round 4
   2140 
   2141 	aese	q2, v23.16b
   2142 	aesmc	q2, q2         @ AES block 2 - round 5
   2143 
   2144 	aese	q1, v23.16b
   2145 	aesmc	q1, q1         @ AES block 1 - round 5
   2146 
   2147 	aese	q3, v23.16b
   2148 	aesmc	q3, q3         @ AES block 3 - round 5
   2149 
   2150 	aese	q2, v24.16b
   2151 	aesmc	q2, q2         @ AES block 2 - round 6
   2152 	ldr	q13, [r3, #64]                        @ load h2l | h2h
   2153 #ifndef __ARMEB__
   2154 	ext	v13.16b, v13.16b, v13.16b, #8
   2155 #endif
   2156 	aese	q1, v24.16b
   2157 	aesmc	q1, q1         @ AES block 1 - round 6
   2158 
   2159 	aese	q3, v24.16b
   2160 	aesmc	q3, q3         @ AES block 3 - round 6
   2161 
   2162 	aese	q0, v25.16b
   2163 	aesmc	q0, q0         @ AES block 0 - round 7
   2164 
   2165 	aese	q1, v25.16b
   2166 	aesmc	q1, q1         @ AES block 1 - round 7
   2167 	trn2	v16.2d,  v12.2d,    v13.2d                     @ h2l | h1l
   2168 
   2169 	aese	q3, v25.16b
   2170 	aesmc	q3, q3         @ AES block 3 - round 7
   2171 
   2172 	aese	q0, v26.16b
   2173 	aesmc	q0, q0         @ AES block 0 - round 8
   2174 
   2175 	aese	q2, v25.16b
   2176 	aesmc	q2, q2         @ AES block 2 - round 7
   2177 	trn1	q8,    v12.2d,    v13.2d                     @ h2h | h1h
   2178 
   2179 	aese	q1, v26.16b
   2180 	aesmc	q1, q1         @ AES block 1 - round 8
   2181 
   2182 	aese	q3, v26.16b
   2183 	aesmc	q3, q3         @ AES block 3 - round 8
   2184 
   2185 	aese	q2, v26.16b
   2186 	aesmc	q2, q2         @ AES block 2 - round 8
   2187 
   2188 	aese	q0, v27.16b
   2189 	aesmc	q0, q0         @ AES block 0 - round 9
   2190 
   2191 	aese	q3, v27.16b
   2192 	aesmc	q3, q3         @ AES block 3 - round 9
   2193 
   2194 	aese	q2, v27.16b
   2195 	aesmc	q2, q2         @ AES block 2 - round 9
   2196 
   2197 	aese	q1, v27.16b
   2198 	aesmc	q1, q1         @ AES block 1 - round 9
   2199 
   2200 	aese	q0, v28.16b
   2201 	aesmc	q0, q0         @ AES block 0 - round 10
   2202 
   2203 	aese	q2, v28.16b
   2204 	aesmc	q2, q2         @ AES block 2 - round 10
   2205 
   2206 	aese	q1, v28.16b
   2207 	aesmc	q1, q1         @ AES block 1 - round 10
   2208 	lsr	r5, r1, #3             @ byte_len
   2209 	mov	r15, r5
   2210 
   2211 	aese	q3, v28.16b
   2212 	aesmc	q3, q3         @ AES block 3 - round 10
   2213 	sub	r5, r5, #1     @ byte_len - 1
   2214 
   2215 	eor	v16.16b, v16.16b, q8                    @ h2k | h1k
   2216 	and	r5, r5, #0xffffffffffffffc0   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   2217 
   2218 	eor	v17.16b, v17.16b, q9                 @ h4k | h3k
   2219 
   2220 	aese	q2, v29.16b                                    @ AES block 2 - round 11
   2221 	add	r4, r0, r1, lsr #3  @ end_input_ptr
   2222 	add	r5, r5, r0
   2223 
   2224 	aese	q1, v29.16b                                    @ AES block 1 - round 11
   2225 	cmp	r0, r5                  @ check if we have <= 4 blocks
   2226 
   2227 	aese	q0, v29.16b                                    @ AES block 0 - round 11
   2228 	add	r12, r12, #1                           @ CTR block 3
   2229 
   2230 	aese	q3, v29.16b                                    @ AES block 3 - round 11
   2231 	bge	.L192_enc_tail                                   @ handle tail
   2232 
   2233 	rev	r9, r12                                @ CTR block 4
   2234 	ldp	r6, r7, [r0, #0]           @ AES block 0 - load plaintext
   2235 #ifdef __ARMEB__
   2236 	rev	r6, r6
   2237 	rev	r7, r7
   2238 #endif
   2239 	orr	r9, r11, r9, lsl #32           @ CTR block 4
   2240 	ldp	r21, r22, [r0, #32]          @ AES block 2 - load plaintext
   2241 #ifdef __ARMEB__
   2242 	rev	r21, r21
   2243 	rev	r22, r22
   2244 #endif
   2245 	ldp	r23, r24, [r0, #48]          @ AES block 3 - load plaintext
   2246 #ifdef __ARMEB__
   2247 	rev	r23, r23
   2248 	rev	r24, r24
   2249 #endif
   2250 	ldp	r19, r20, [r0, #16]          @ AES block 1 - load plaintext
   2251 #ifdef __ARMEB__
   2252 	rev	r19, r19
   2253 	rev	r20, r20
   2254 #endif
   2255 	add	r0, r0, #64                      @ AES input_ptr update
   2256 	cmp	r0, r5                  @ check if we have <= 8 blocks
   2257 
   2258 	eor	r6, r6, r13                    @ AES block 0 - round 12 low
   2259 
   2260 	eor	r7, r7, r14                    @ AES block 0 - round 12 high
   2261 	eor	r22, r22, r14                    @ AES block 2 - round 12 high
   2262 	fmov	d4, r6                              @ AES block 0 - mov low
   2263 
   2264 	eor	r24, r24, r14                    @ AES block 3 - round 12 high
   2265 	fmov	v4.d[1], r7                          @ AES block 0 - mov high
   2266 
   2267 	eor	r21, r21, r13                    @ AES block 2 - round 12 low
   2268 	eor	r19, r19, r13                    @ AES block 1 - round 12 low
   2269 
   2270 	fmov	d5, r19                              @ AES block 1 - mov low
   2271 	eor	r20, r20, r14                    @ AES block 1 - round 12 high
   2272 
   2273 	fmov	v5.d[1], r20                          @ AES block 1 - mov high
   2274 
   2275 	eor	r23, r23, r13                    @ AES block 3 - round 12 low
   2276 	fmov	d6, r21                              @ AES block 2 - mov low
   2277 
   2278 	add	r12, r12, #1                           @ CTR block 4
   2279 	eor	q4, q4, q0                         @ AES block 0 - result
   2280 	fmov	d0, r10                              @ CTR block 4
   2281 
   2282 	fmov	v0.d[1], r9                              @ CTR block 4
   2283 	rev	r9, r12                                @ CTR block 5
   2284 
   2285 	orr	r9, r11, r9, lsl #32           @ CTR block 5
   2286 	add	r12, r12, #1                           @ CTR block 5
   2287 
   2288 	fmov	d7, r23                              @ AES block 3 - mov low
   2289 	st1	{ q4}, [r2], #16                    @ AES block 0 - store result
   2290 
   2291 	fmov	v6.d[1], r22                          @ AES block 2 - mov high
   2292 
   2293 	eor	q5, q5, q1                         @ AES block 1 - result
   2294 	fmov	d1, r10                              @ CTR block 5
   2295 	st1	{ q5}, [r2], #16                    @ AES block 1 - store result
   2296 
   2297 	fmov	v7.d[1], r24                          @ AES block 3 - mov high
   2298 
   2299 	fmov	v1.d[1], r9                              @ CTR block 5
   2300 	rev	r9, r12                                @ CTR block 6
   2301 
   2302 	orr	r9, r11, r9, lsl #32           @ CTR block 6
   2303 
   2304 	add	r12, r12, #1                           @ CTR block 6
   2305 	eor	q6, q6, q2                         @ AES block 2 - result
   2306 	fmov	d2, r10                              @ CTR block 6
   2307 
   2308 	fmov	v2.d[1], r9                              @ CTR block 6
   2309 	rev	r9, r12                                @ CTR block 7
   2310 
   2311 	orr	r9, r11, r9, lsl #32           @ CTR block 7
   2312 	st1	{ q6}, [r2], #16                    @ AES block 2 - store result
   2313 
   2314 	eor	q7, q7, q3                         @ AES block 3 - result
   2315 	st1	{ q7}, [r2], #16                    @ AES block 3 - store result
   2316 	bge	.L192_enc_prepretail                             @ do prepretail
   2317 
   2318 .L192_enc_main_loop:@ main loop start
   2319 	aese	q2, v18.16b
   2320 	aesmc	q2, q2         @ AES block 4k+6 - round 0
   2321 	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
   2322 
   2323 	aese	q1, v18.16b
   2324 	aesmc	q1, q1         @ AES block 4k+5 - round 0
   2325 	ldp	r19, r20, [r0, #16]          @ AES block 4k+5 - load plaintext
   2326 #ifdef __ARMEB__
   2327 	rev	r19, r19
   2328 	rev	r20, r20
   2329 #endif
   2330 	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
   2331 	fmov	d3, r10                              @ CTR block 4k+3
   2332 	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
   2333 
   2334 	aese	q2, v19.16b
   2335 	aesmc	q2, q2         @ AES block 4k+6 - round 1
   2336 	fmov	v3.d[1], r9                              @ CTR block 4k+3
   2337 
   2338 	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
   2339 	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2340 	ldp	r21, r22, [r0, #32]          @ AES block 4k+6 - load plaintext
   2341 #ifdef __ARMEB__
   2342 	rev	r21, r21
   2343 	rev	r22, r22
   2344 #endif
   2345 	aese	q0, v18.16b
   2346 	aesmc	q0, q0         @ AES block 4k+4 - round 0
   2347 	ldp	r23, r24, [r0, #48]          @ AES block 4k+3 - load plaintext
   2348 #ifdef __ARMEB__
   2349 	rev	r23, r23
   2350 	rev	r24, r24
   2351 #endif
   2352 	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
   2353 	eor	q4, q4, v11.16b                          @ PRE 1
   2354 
   2355 	aese	q1, v19.16b
   2356 	aesmc	q1, q1         @ AES block 4k+5 - round 1
   2357 
   2358 	aese	q0, v19.16b
   2359 	aesmc	q0, q0         @ AES block 4k+4 - round 1
   2360 	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
   2361 
   2362 	aese	q3, v18.16b
   2363 	aesmc	q3, q3         @ AES block 4k+7 - round 0
   2364 	eor	r24, r24, r14                    @ AES block 4k+3 - round 12 high
   2365 
   2366 	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
   2367 	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
   2368 
   2369 	aese	q0, v20.16b
   2370 	aesmc	q0, q0         @ AES block 4k+4 - round 2
   2371 
   2372 	aese	q3, v19.16b
   2373 	aesmc	q3, q3         @ AES block 4k+7 - round 1
   2374 	eor	r21, r21, r13                    @ AES block 4k+6 - round 12 low
   2375 
   2376 	eor	q8, q8, q4                         @ GHASH block 4k - mid
   2377 	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
   2378 
   2379 	aese	q0, v21.16b
   2380 	aesmc	q0, q0         @ AES block 4k+4 - round 3
   2381 	eor	r19, r19, r13                    @ AES block 4k+5 - round 12 low
   2382 
   2383 	aese	q1, v20.16b
   2384 	aesmc	q1, q1         @ AES block 4k+5 - round 2
   2385 	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
   2386 
   2387 	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
   2388 	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
   2389 
   2390 	aese	q2, v20.16b
   2391 	aesmc	q2, q2         @ AES block 4k+6 - round 2
   2392 
   2393 	aese	q1, v21.16b
   2394 	aesmc	q1, q1         @ AES block 4k+5 - round 3
   2395 
   2396 	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
   2397 	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
   2398 
   2399 	aese	q3, v20.16b
   2400 	aesmc	q3, q3         @ AES block 4k+7 - round 2
   2401 	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
   2402 
   2403 	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
   2404 
   2405 	aese	q0, v22.16b
   2406 	aesmc	q0, q0         @ AES block 4k+4 - round 4
   2407 	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
   2408 
   2409 	aese	q3, v21.16b
   2410 	aesmc	q3, q3         @ AES block 4k+7 - round 3
   2411 
   2412 	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
   2413 	eor	r20, r20, r14                    @ AES block 4k+5 - round 12 high
   2414 	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
   2415 
   2416 	aese	q0, v23.16b
   2417 	aesmc	q0, q0         @ AES block 4k+4 - round 5
   2418 	add	r12, r12, #1                           @ CTR block 4k+3
   2419 
   2420 	aese	q3, v22.16b
   2421 	aesmc	q3, q3         @ AES block 4k+7 - round 4
   2422 	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
   2423 
   2424 	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
   2425 	eor	r22, r22, r14                    @ AES block 4k+6 - round 12 high
   2426 
   2427 	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
   2428 	eor	r23, r23, r13                    @ AES block 4k+3 - round 12 low
   2429 	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
   2430 
   2431 	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
   2432 	rev	r9, r12                                @ CTR block 4k+8
   2433 
   2434 	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
   2435 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+8
   2436 
   2437 	aese	q2, v21.16b
   2438 	aesmc	q2, q2         @ AES block 4k+6 - round 3
   2439 	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
   2440 
   2441 	aese	q1, v22.16b
   2442 	aesmc	q1, q1         @ AES block 4k+5 - round 4
   2443 	ldp	r6, r7, [r0, #0]           @ AES block 4k+4 - load plaintext
   2444 #ifdef __ARMEB__
   2445 	rev	r6, r6
   2446 	rev	r7, r7
   2447 #endif
   2448 	aese	q0, v24.16b
   2449 	aesmc	q0, q0         @ AES block 4k+4 - round 6
   2450 	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
   2451 
   2452 	aese	q2, v22.16b
   2453 	aesmc	q2, q2         @ AES block 4k+6 - round 4
   2454 	add	r0, r0, #64                      @ AES input_ptr update
   2455 
   2456 	aese	q1, v23.16b
   2457 	aesmc	q1, q1         @ AES block 4k+5 - round 5
   2458 	movi	q8, #0xc2
   2459 
   2460 	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
   2461 	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
   2462 	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
   2463 
   2464 	aese	q2, v23.16b
   2465 	aesmc	q2, q2         @ AES block 4k+6 - round 5
   2466 	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
   2467 
   2468 	aese	q1, v24.16b
   2469 	aesmc	q1, q1         @ AES block 4k+5 - round 6
   2470 	shl	d8, d8, #56              @ mod_constant
   2471 
   2472 	aese	q3, v23.16b
   2473 	aesmc	q3, q3         @ AES block 4k+7 - round 5
   2474 	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
   2475 
   2476 	aese	q0, v25.16b
   2477 	aesmc	q0, q0         @ AES block 4k+4 - round 7
   2478 	fmov	d5, r19                              @ AES block 4k+5 - mov low
   2479 
   2480 	aese	q1, v25.16b
   2481 	aesmc	q1, q1         @ AES block 4k+5 - round 7
   2482 	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
   2483 
   2484 	aese	q3, v24.16b
   2485 	aesmc	q3, q3         @ AES block 4k+7 - round 6
   2486 	fmov	v5.d[1], r20                          @ AES block 4k+5 - mov high
   2487 
   2488 	aese	q0, v26.16b
   2489 	aesmc	q0, q0         @ AES block 4k+4 - round 8
   2490 	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
   2491 
   2492 	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
   2493 	cmp	r0, r5                  @ LOOP CONTROL
   2494 	fmov	d4, r6                              @ AES block 4k+4 - mov low
   2495 
   2496 	aese	q2, v24.16b
   2497 	aesmc	q2, q2         @ AES block 4k+6 - round 6
   2498 	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
   2499 
   2500 	aese	q1, v26.16b
   2501 	aesmc	q1, q1         @ AES block 4k+5 - round 8
   2502 	fmov	d7, r23                              @ AES block 4k+3 - mov low
   2503 
   2504 	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
   2505 	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
   2506 	add	r12, r12, #1                           @ CTR block 4k+8
   2507 
   2508 	aese	q2, v25.16b
   2509 	aesmc	q2, q2         @ AES block 4k+6 - round 7
   2510 	fmov	v7.d[1], r24                          @ AES block 4k+3 - mov high
   2511 
   2512 	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
   2513 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   2514 	fmov	d6, r21                              @ AES block 4k+6 - mov low
   2515 
   2516 	aese	q3, v25.16b
   2517 	aesmc	q3, q3         @ AES block 4k+7 - round 7
   2518 
   2519 	aese	q0, v27.16b
   2520 	aesmc	q0, q0         @ AES block 4k+4 - round 9
   2521 	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
   2522 
   2523 	aese	q2, v26.16b
   2524 	aesmc	q2, q2         @ AES block 4k+6 - round 8
   2525 
   2526 	aese	q3, v26.16b
   2527 	aesmc	q3, q3         @ AES block 4k+7 - round 8
   2528 
   2529 	aese	q1, v27.16b
   2530 	aesmc	q1, q1         @ AES block 4k+5 - round 9
   2531 
   2532 	aese	q0, v28.16b
   2533 	aesmc	q0, q0         @ AES block 4k+4 - round 10
   2534 	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
   2535 
   2536 	aese	q3, v27.16b
   2537 	aesmc	q3, q3         @ AES block 4k+7 - round 9
   2538 
   2539 	aese	q2, v27.16b
   2540 	aesmc	q2, q2         @ AES block 4k+6 - round 9
   2541 
   2542 	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
   2543 
   2544 	aese	q1, v28.16b
   2545 	aesmc	q1, q1         @ AES block 4k+5 - round 10
   2546 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   2547 
   2548 	aese	q2, v28.16b
   2549 	aesmc	q2, q2         @ AES block 4k+6 - round 10
   2550 
   2551 	eor	q4, q4, q0                         @ AES block 4k+4 - result
   2552 	fmov	d0, r10                              @ CTR block 4k+8
   2553 
   2554 	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
   2555 	fmov	v0.d[1], r9                              @ CTR block 4k+8
   2556 	rev	r9, r12                                @ CTR block 4k+9
   2557 
   2558 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   2559 	fmov	v6.d[1], r22                          @ AES block 4k+6 - mov high
   2560 	st1	{ q4}, [r2], #16                    @ AES block 4k+4 - store result
   2561 
   2562 	aese	q3, v28.16b
   2563 	aesmc	q3, q3         @ AES block 4k+7 - round 10
   2564 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+9
   2565 
   2566 	eor	q5, q5, q1                         @ AES block 4k+5 - result
   2567 	add	r12, r12, #1                           @ CTR block 4k+9
   2568 	fmov	d1, r10                              @ CTR block 4k+9
   2569 
   2570 	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
   2571 	fmov	v1.d[1], r9                              @ CTR block 4k+9
   2572 	rev	r9, r12                                @ CTR block 4k+10
   2573 
   2574 	add	r12, r12, #1                           @ CTR block 4k+10
   2575 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   2576 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+10
   2577 
   2578 	st1	{ q5}, [r2], #16                    @ AES block 4k+5 - store result
   2579 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   2580 
   2581 	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
   2582 	eor	q6, q6, q2                         @ AES block 4k+6 - result
   2583 	fmov	d2, r10                              @ CTR block 4k+10
   2584 
   2585 	st1	{ q6}, [r2], #16                    @ AES block 4k+6 - store result
   2586 	fmov	v2.d[1], r9                              @ CTR block 4k+10
   2587 	rev	r9, r12                                @ CTR block 4k+11
   2588 
   2589 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   2590 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+11
   2591 
   2592 	eor	q7, q7, q3                         @ AES block 4k+3 - result
   2593 	st1	{ q7}, [r2], #16                    @ AES block 4k+3 - store result
   2594 	blt	.L192_enc_main_loop
   2595 
   2596 .L192_enc_prepretail:@ PREPRETAIL
   2597 	aese	q0, v18.16b
   2598 	aesmc	q0, q0         @ AES block 4k+4 - round 0
   2599 	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
   2600 
   2601 	fmov	d3, r10                              @ CTR block 4k+3
   2602 	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
   2603 	add	r12, r12, #1                           @ CTR block 4k+3
   2604 
   2605 	aese	q1, v18.16b
   2606 	aesmc	q1, q1         @ AES block 4k+5 - round 0
   2607 	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
   2608 
   2609 	aese	q2, v18.16b
   2610 	aesmc	q2, q2         @ AES block 4k+6 - round 0
   2611 
   2612 	fmov	v3.d[1], r9                              @ CTR block 4k+3
   2613 	eor	q4, q4, v11.16b                          @ PRE 1
   2614 	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
   2615 
   2616 	aese	q1, v19.16b
   2617 	aesmc	q1, q1         @ AES block 4k+5 - round 1
   2618 	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
   2619 
   2620 	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
   2621 
   2622 	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
   2623 	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
   2624 
   2625 	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
   2626 	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2627 
   2628 	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
   2629 
   2630 	eor	q8, q8, q4                         @ GHASH block 4k - mid
   2631 	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
   2632 
   2633 	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
   2634 	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
   2635 
   2636 	aese	q3, v18.16b
   2637 	aesmc	q3, q3         @ AES block 4k+7 - round 0
   2638 	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
   2639 
   2640 	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
   2641 
   2642 	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
   2643 	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
   2644 
   2645 	aese	q3, v19.16b
   2646 	aesmc	q3, q3         @ AES block 4k+7 - round 1
   2647 
   2648 	aese	q2, v19.16b
   2649 	aesmc	q2, q2         @ AES block 4k+6 - round 1
   2650 	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
   2651 
   2652 	aese	q0, v19.16b
   2653 	aesmc	q0, q0         @ AES block 4k+4 - round 1
   2654 
   2655 	aese	q1, v20.16b
   2656 	aesmc	q1, q1         @ AES block 4k+5 - round 2
   2657 	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
   2658 
   2659 	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
   2660 	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
   2661 
   2662 	aese	q0, v20.16b
   2663 	aesmc	q0, q0         @ AES block 4k+4 - round 2
   2664 
   2665 	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
   2666 	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
   2667 
   2668 	aese	q1, v21.16b
   2669 	aesmc	q1, q1         @ AES block 4k+5 - round 3
   2670 
   2671 	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
   2672 
   2673 	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
   2674 
   2675 	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
   2676 	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
   2677 
   2678 	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
   2679 
   2680 	aese	q0, v21.16b
   2681 	aesmc	q0, q0         @ AES block 4k+4 - round 3
   2682 	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
   2683 
   2684 	aese	q3, v20.16b
   2685 	aesmc	q3, q3         @ AES block 4k+7 - round 2
   2686 
   2687 	aese	q2, v20.16b
   2688 	aesmc	q2, q2         @ AES block 4k+6 - round 2
   2689 	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
   2690 
   2691 	aese	q0, v22.16b
   2692 	aesmc	q0, q0         @ AES block 4k+4 - round 4
   2693 
   2694 	aese	q3, v21.16b
   2695 	aesmc	q3, q3         @ AES block 4k+7 - round 3
   2696 	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
   2697 
   2698 	aese	q2, v21.16b
   2699 	aesmc	q2, q2         @ AES block 4k+6 - round 3
   2700 
   2701 	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
   2702 	movi	q8, #0xc2
   2703 
   2704 	aese	q3, v22.16b
   2705 	aesmc	q3, q3         @ AES block 4k+7 - round 4
   2706 
   2707 	aese	q2, v22.16b
   2708 	aesmc	q2, q2         @ AES block 4k+6 - round 4
   2709 
   2710 	aese	q1, v22.16b
   2711 	aesmc	q1, q1         @ AES block 4k+5 - round 4
   2712 	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
   2713 
   2714 	aese	q3, v23.16b
   2715 	aesmc	q3, q3         @ AES block 4k+7 - round 5
   2716 
   2717 	aese	q2, v23.16b
   2718 	aesmc	q2, q2         @ AES block 4k+6 - round 5
   2719 
   2720 	aese	q1, v23.16b
   2721 	aesmc	q1, q1         @ AES block 4k+5 - round 5
   2722 	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
   2723 
   2724 	aese	q0, v23.16b
   2725 	aesmc	q0, q0         @ AES block 4k+4 - round 5
   2726 
   2727 	aese	q3, v24.16b
   2728 	aesmc	q3, q3         @ AES block 4k+7 - round 6
   2729 	eor	v10.16b, v10.16b, q9                        @ karatsuba tidy up
   2730 
   2731 	aese	q1, v24.16b
   2732 	aesmc	q1, q1         @ AES block 4k+5 - round 6
   2733 
   2734 	aese	q0, v24.16b
   2735 	aesmc	q0, q0         @ AES block 4k+4 - round 6
   2736 	shl	d8, d8, #56              @ mod_constant
   2737 
   2738 	aese	q3, v25.16b
   2739 	aesmc	q3, q3         @ AES block 4k+7 - round 7
   2740 
   2741 	aese	q1, v25.16b
   2742 	aesmc	q1, q1         @ AES block 4k+5 - round 7
   2743 	eor	v10.16b, v10.16b, v11.16b
   2744 
   2745 	aese	q0, v25.16b
   2746 	aesmc	q0, q0         @ AES block 4k+4 - round 7
   2747 
   2748 	pmull	v30.1q, q9, q8
   2749 
   2750 	aese	q2, v24.16b
   2751 	aesmc	q2, q2         @ AES block 4k+6 - round 6
   2752 	ext	q9, q9, q9, #8
   2753 
   2754 	aese	q0, v26.16b
   2755 	aesmc	q0, q0         @ AES block 4k+4 - round 8
   2756 
   2757 	aese	q1, v26.16b
   2758 	aesmc	q1, q1         @ AES block 4k+5 - round 8
   2759 	eor	v10.16b, v10.16b, v30.16b
   2760 
   2761 	aese	q2, v25.16b
   2762 	aesmc	q2, q2         @ AES block 4k+6 - round 7
   2763 
   2764 	aese	q3, v26.16b
   2765 	aesmc	q3, q3         @ AES block 4k+7 - round 8
   2766 
   2767 	aese	q0, v27.16b
   2768 	aesmc	q0, q0         @ AES block 4k+4 - round 9
   2769 
   2770 	aese	q2, v26.16b
   2771 	aesmc	q2, q2         @ AES block 4k+6 - round 8
   2772 	eor	v10.16b, v10.16b, q9
   2773 
   2774 	aese	q3, v27.16b
   2775 	aesmc	q3, q3         @ AES block 4k+7 - round 9
   2776 
   2777 	aese	q1, v27.16b
   2778 	aesmc	q1, q1         @ AES block 4k+5 - round 9
   2779 
   2780 	aese	q2, v27.16b
   2781 	aesmc	q2, q2         @ AES block 4k+6 - round 9
   2782 
   2783 	pmull	v30.1q, v10.1d, q8
   2784 
   2785 	ext	v10.16b, v10.16b, v10.16b, #8
   2786 
   2787 	aese	q3, v28.16b
   2788 	aesmc	q3, q3         @ AES block 4k+7 - round 10
   2789 
   2790 	aese	q0, v28.16b
   2791 	aesmc	q0, q0         @ AES block 4k+4 - round 10
   2792 
   2793 	aese	q2, v28.16b
   2794 	aesmc	q2, q2         @ AES block 4k+6 - round 10
   2795 
   2796 	aese	q1, v28.16b
   2797 	aesmc	q1, q1         @ AES block 4k+5 - round 10
   2798 	eor	v11.16b, v11.16b, v30.16b
   2799 
   2800 	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
   2801 
   2802 	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
   2803 
   2804 	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
   2805 
   2806 	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
   2807 	eor	v11.16b, v11.16b, v10.16b
   2808 .L192_enc_tail:@ TAIL
   2809 
   2810 	sub	r5, r4, r0  @ r5 (was main_end_input_ptr) now holds the number of bytes left to process
   2811 	ldp	r6, r7, [r0], #16          @ AES block 4k+4 - load plaintext
   2812 #ifdef __ARMEB__
   2813 	rev	r6, r6
   2814 	rev	r7, r7
   2815 #endif
   2816 	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
   2817 	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
   2818 
   2819 	fmov	d4, r6                              @ AES block 4k+4 - mov low
   2820 
   2821 	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
   2822 	cmp	r5, #48
   2823 
   2824 	eor	q5, q4, q0                         @ AES block 4k+4 - result
   2825 
   2826 	ext	q8, v11.16b, v11.16b, #8                    @ prepare final partial tag
   2827 	bgt	.L192_enc_blocks_more_than_3
   2828 
   2829 	sub	r12, r12, #1
   2830 	movi	v10.8b, #0
   2831 
   2832 	mov	q3, q2
   2833 	movi	q9, #0
   2834 	cmp	r5, #32
   2835 
   2836 	mov	q2, q1
   2837 	movi	v11.8b, #0
   2838 	bgt	.L192_enc_blocks_more_than_2
   2839 
   2840 	sub	r12, r12, #1
   2841 
   2842 	mov	q3, q1
   2843 	cmp	r5, #16
   2844 	bgt	.L192_enc_blocks_more_than_1
   2845 
   2846 	sub	r12, r12, #1
   2847 	b	.L192_enc_blocks_less_than_1
   2848 .L192_enc_blocks_more_than_3:@ blocks left >  3
   2849 	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
   2850 
   2851 	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
   2852 #ifdef __ARMEB__
   2853 	rev	r6, r6
   2854 	rev	r7, r7
   2855 #endif
   2856 	rev64	q4, q5                                   @ GHASH final-3 block
   2857 
   2858 	eor	r6, r6, r13                    @ AES final-2 block - round 12 low
   2859 	eor	q4, q4, q8                          @ feed in partial tag
   2860 
   2861 	eor	r7, r7, r14                    @ AES final-2 block - round 12 high
   2862 	fmov	d5, r6                                @ AES final-2 block - mov low
   2863 
   2864 	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
   2865 
   2866 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   2867 
   2868 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   2869 
   2870 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   2871 
   2872 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   2873 
   2874 	movi	q8, #0                                       @ suppress further partial tag feed in
   2875 
   2876 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   2877 
   2878 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   2879 	eor	q5, q5, q1                           @ AES final-2 block - result
   2880 .L192_enc_blocks_more_than_2:@ blocks left >  2
   2881 
   2882 	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
   2883 
   2884 	rev64	q4, q5                                   @ GHASH final-2 block
   2885 	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
   2886 #ifdef __ARMEB__
   2887 	rev	r6, r6
   2888 	rev	r7, r7
   2889 #endif
   2890 	eor	q4, q4, q8                          @ feed in partial tag
   2891 
   2892 	eor	r7, r7, r14                    @ AES final-1 block - round 12 high
   2893 
   2894 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   2895 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   2896 
   2897 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   2898 	eor	r6, r6, r13                    @ AES final-1 block - round 12 low
   2899 
   2900 	fmov	d5, r6                                @ AES final-1 block - mov low
   2901 
   2902 	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
   2903 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   2904 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   2905 
   2906 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   2907 
   2908 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   2909 
   2910 	movi	q8, #0                                       @ suppress further partial tag feed in
   2911 
   2912 	eor	q5, q5, q2                           @ AES final-1 block - result
   2913 
   2914 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   2915 .L192_enc_blocks_more_than_1:@ blocks left >  1
   2916 
   2917 	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result
   2918 
   2919 	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high
   2920 #ifdef __ARMEB__
   2921 	rev	r6, r6
   2922 	rev	r7, r7
   2923 #endif
   2924 	rev64	q4, q5                                   @ GHASH final-1 block
   2925 
   2926 	eor	r6, r6, r13                    @ AES final block - round 12 low
   2927 	eor	q4, q4, q8                          @ feed in partial tag
   2928 	movi	q8, #0                                       @ suppress further partial tag feed in
   2929 
   2930 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   2931 
   2932 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   2933 	eor	r7, r7, r14                    @ AES final block - round 12 high
   2934 	fmov	d5, r6                                @ AES final block - mov low
   2935 
   2936 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   2937 	fmov	v5.d[1], r7                            @ AES final block - mov high
   2938 
   2939 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   2940 
   2941 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   2942 
   2943 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   2944 
   2945 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   2946 
   2947 	eor	q5, q5, q3                           @ AES final block - result
   2948 
   2949 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   2950 
   2951 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   2952 .L192_enc_blocks_less_than_1:@ blocks left <= 1
   2953 
   2954 	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
   2955 #ifndef __ARMEB__
   2956 	rev	r9, r12
   2957 #else
   2958 	mov	r9, r12
   2959 #endif
   2960 	and	r1, r1, #127                   @ bit_length %= 128
   2961 
   2962 	sub	r1, r1, #128                   @ bit_length -= 128
   2963 	mvn	r14, xzr                                     @ rk12_h = 0xffffffffffffffff
   2964 
   2965 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   2966 	mvn	r13, xzr                                     @ rk12_l = 0xffffffffffffffff
   2967 
   2968 	and	r1, r1, #127                   @ bit_length %= 128
   2969 
   2970 	lsr	r14, r14, r1                    @ rk12_h is mask for top 64b of last block
   2971 	cmp	r1, #64
   2972 
   2973 	csel	r6, r13, r14, lt
   2974 	csel	r7, r14, xzr, lt
   2975 
   2976 	fmov	d0, r6                                @ ctr0b is mask for last block
   2977 
   2978 	fmov	v0.d[1], r7
   2979 
   2980 	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
   2981 
   2982 	rev64	q4, q5                                   @ GHASH final block
   2983 
   2984 	eor	q4, q4, q8                          @ feed in partial tag
   2985 
   2986 	mov	d8, v4.d[1]                                 @ GHASH final block - mid
   2987 
   2988 	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
   2989 
   2990 	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
   2991 
   2992 	eor	q8, q8, q4                         @ GHASH final block - mid
   2993 
   2994 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
   2995 
   2996 	eor	q9, q9, v20.16b                           @ GHASH final block - high
   2997 
   2998 	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
   2999 
   3000 	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
   3001 	movi	q8, #0xc2
   3002 
   3003 	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
   3004 
   3005 	shl	d8, d8, #56              @ mod_constant
   3006 
   3007 	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
   3008 
   3009 	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
   3010 
   3011 	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
   3012 
   3013 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   3014 
   3015 	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
   3016 
   3017 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   3018 
   3019 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   3020 
   3021 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   3022 
   3023 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   3024 	str	r9, [r16, #12]                         @ store the updated counter
   3025 
   3026 	st1	{ q5}, [r2]                         @ store all 16B
   3027 
   3028 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   3029 	ext	v11.16b, v11.16b, v11.16b, #8
   3030 	rev64	v11.16b, v11.16b
   3031 	mov	r0, r15
   3032 	st1	{ v11.16b }, [r3]
   3033 
   3034 	ldp	r21, r22, [sp, #16]
   3035 	ldp	r23, r24, [sp, #32]
   3036 	ldp	d8, d9, [sp, #48]
   3037 	ldp	d10, d11, [sp, #64]
   3038 	ldp	d12, d13, [sp, #80]
   3039 	ldp	d14, d15, [sp, #96]
   3040 	ldp	r19, r20, [sp], #112
   3041 	RET
   3042 
   3043 .L192_enc_ret:
   3044 	mov	r0, #0x0
   3045 	RET
   3046 .size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
   3047 .globl	aes_gcm_dec_192_kernel
   3048 .type	aes_gcm_dec_192_kernel,%function
   3049 .align	4
   3050 aes_gcm_dec_192_kernel:
   3051 	cbz	r1, .L192_dec_ret
   3052 	stp	r19, r20, [sp, #-112]!
   3053 	mov	r16, r4
   3054 	mov	r8, r5
   3055 	stp	r21, r22, [sp, #16]
   3056 	stp	r23, r24, [sp, #32]
   3057 	stp	d8, d9, [sp, #48]
   3058 	stp	d10, d11, [sp, #64]
   3059 	stp	d12, d13, [sp, #80]
   3060 	stp	d14, d15, [sp, #96]
   3061 
   3062 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   3063 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   3064 #ifdef __ARMEB__
   3065 	rev	r10, r10
   3066 	rev	r11, r11
   3067 #endif
   3068 	ldp	r13, r14, [r8, #192]                     @ load rk12
   3069 #ifdef __ARMEB__
   3070 	ror	r13, r13, #32
   3071 	ror	r14, r14, #32
   3072 #endif
   3073 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   3074 
   3075 	ld1	{v18.4s}, [r8], #16                                  @ load rk0
   3076 
   3077 	lsr	r5, r1, #3              @ byte_len
   3078 	mov	r15, r5
   3079 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   3080 
   3081 	lsr	r12, r11, #32
   3082 	orr	r11, r11, r11
   3083 	fmov	d3, r10                               @ CTR block 3
   3084 
   3085 	rev	r12, r12                                @ rev_ctr32
   3086 	fmov	d1, r10                               @ CTR block 1
   3087 
   3088 	add	r12, r12, #1                            @ increment rev_ctr32
   3089 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   3090 
   3091 	aese	q0, v18.16b
   3092 	aesmc	q0, q0          @ AES block 0 - round 0
   3093 	rev	r9, r12                                 @ CTR block 1
   3094 
   3095 	add	r12, r12, #1                            @ CTR block 1
   3096 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   3097 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   3098 
   3099 	fmov	v1.d[1], r9                               @ CTR block 1
   3100 	rev	r9, r12                                 @ CTR block 2
   3101 	add	r12, r12, #1                            @ CTR block 2
   3102 
   3103 	fmov	d2, r10                               @ CTR block 2
   3104 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   3105 
   3106 	fmov	v2.d[1], r9                               @ CTR block 2
   3107 	rev	r9, r12                                 @ CTR block 3
   3108 
   3109 	aese	q0, v19.16b
   3110 	aesmc	q0, q0          @ AES block 0 - round 1
   3111 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   3112 
   3113 	fmov	v3.d[1], r9                               @ CTR block 3
   3114 
   3115 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   3116 
   3117 	aese	q0, v20.16b
   3118 	aesmc	q0, q0          @ AES block 0 - round 2
   3119 
   3120 	aese	q2, v18.16b
   3121 	aesmc	q2, q2          @ AES block 2 - round 0
   3122 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   3123 
   3124 	aese	q1, v18.16b
   3125 	aesmc	q1, q1          @ AES block 1 - round 0
   3126 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   3127 #ifndef __ARMEB__
   3128 	ext	v15.16b, v15.16b, v15.16b, #8
   3129 #endif
   3130 	aese	q3, v18.16b
   3131 	aesmc	q3, q3          @ AES block 3 - round 0
   3132 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   3133 #ifndef __ARMEB__
   3134 	ext	v13.16b, v13.16b, v13.16b, #8
   3135 #endif
   3136 	aese	q2, v19.16b
   3137 	aesmc	q2, q2          @ AES block 2 - round 1
   3138 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   3139 #ifndef __ARMEB__
   3140 	ext	v14.16b, v14.16b, v14.16b, #8
   3141 #endif
   3142 	aese	q1, v19.16b
   3143 	aesmc	q1, q1          @ AES block 1 - round 1
   3144 
   3145 	aese	q3, v19.16b
   3146 	aesmc	q3, q3          @ AES block 3 - round 1
   3147 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   3148 #ifndef __ARMEB__
   3149 	ext	v12.16b, v12.16b, v12.16b, #8
   3150 #endif
   3151 	aese	q2, v20.16b
   3152 	aesmc	q2, q2          @ AES block 2 - round 2
   3153 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   3154 
   3155 	aese	q0, v21.16b
   3156 	aesmc	q0, q0          @ AES block 0 - round 3
   3157 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   3158 
   3159 	aese	q1, v20.16b
   3160 	aesmc	q1, q1          @ AES block 1 - round 2
   3161 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   3162 
   3163 	aese	q3, v20.16b
   3164 	aesmc	q3, q3          @ AES block 3 - round 2
   3165 	ld1	{v27.4s}, [r8], #16                               @ load rk9
   3166 
   3167 	aese	q2, v21.16b
   3168 	aesmc	q2, q2          @ AES block 2 - round 3
   3169 	ld1	{ v11.16b}, [r3]
   3170 	ext	v11.16b, v11.16b, v11.16b, #8
   3171 	rev64	v11.16b, v11.16b
   3172 
   3173 	aese	q1, v21.16b
   3174 	aesmc	q1, q1          @ AES block 1 - round 3
   3175 	add	r12, r12, #1                            @ CTR block 3
   3176 
   3177 	aese	q3, v21.16b
   3178 	aesmc	q3, q3          @ AES block 3 - round 3
   3179 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   3180 
   3181 	aese	q0, v22.16b
   3182 	aesmc	q0, q0          @ AES block 0 - round 4
   3183 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   3184 
   3185 	aese	q1, v22.16b
   3186 	aesmc	q1, q1          @ AES block 1 - round 4
   3187 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   3188 
   3189 	aese	q2, v22.16b
   3190 	aesmc	q2, q2          @ AES block 2 - round 4
   3191 
   3192 	aese	q3, v22.16b
   3193 	aesmc	q3, q3          @ AES block 3 - round 4
   3194 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   3195 
   3196 	aese	q0, v23.16b
   3197 	aesmc	q0, q0          @ AES block 0 - round 5
   3198 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   3199 
   3200 	aese	q1, v23.16b
   3201 	aesmc	q1, q1          @ AES block 1 - round 5
   3202 
   3203 	aese	q2, v23.16b
   3204 	aesmc	q2, q2          @ AES block 2 - round 5
   3205 
   3206 	aese	q3, v23.16b
   3207 	aesmc	q3, q3          @ AES block 3 - round 5
   3208 
   3209 	aese	q0, v24.16b
   3210 	aesmc	q0, q0          @ AES block 0 - round 6
   3211 
   3212 	aese	q2, v24.16b
   3213 	aesmc	q2, q2          @ AES block 2 - round 6
   3214 
   3215 	aese	q3, v24.16b
   3216 	aesmc	q3, q3          @ AES block 3 - round 6
   3217 
   3218 	aese	q0, v25.16b
   3219 	aesmc	q0, q0          @ AES block 0 - round 7
   3220 
   3221 	aese	q2, v25.16b
   3222 	aesmc	q2, q2          @ AES block 2 - round 7
   3223 
   3224 	aese	q3, v25.16b
   3225 	aesmc	q3, q3          @ AES block 3 - round 7
   3226 
   3227 	aese	q1, v24.16b
   3228 	aesmc	q1, q1          @ AES block 1 - round 6
   3229 
   3230 	aese	q2, v26.16b
   3231 	aesmc	q2, q2          @ AES block 2 - round 8
   3232 
   3233 	aese	q3, v26.16b
   3234 	aesmc	q3, q3          @ AES block 3 - round 8
   3235 
   3236 	aese	q1, v25.16b
   3237 	aesmc	q1, q1          @ AES block 1 - round 7
   3238 
   3239 	aese	q2, v27.16b
   3240 	aesmc	q2, q2          @ AES block 2 - round 9
   3241 
   3242 	aese	q3, v27.16b
   3243 	aesmc	q3, q3          @ AES block 3 - round 9
   3244 
   3245 	aese	q1, v26.16b
   3246 	aesmc	q1, q1          @ AES block 1 - round 8
   3247 	sub	r5, r5, #1      @ byte_len - 1
   3248 
   3249 	aese	q0, v26.16b
   3250 	aesmc	q0, q0          @ AES block 0 - round 8
   3251 	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   3252 
   3253 	aese	q3, v28.16b
   3254 	aesmc	q3, q3          @ AES block 3 - round 10
   3255 	add	r5, r5, r0
   3256 
   3257 	aese	q1, v27.16b
   3258 	aesmc	q1, q1          @ AES block 1 - round 9
   3259 	cmp	r0, r5                   @ check if we have <= 4 blocks
   3260 
   3261 	aese	q0, v27.16b
   3262 	aesmc	q0, q0          @ AES block 0 - round 9
   3263 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   3264 
   3265 	aese	q3, v29.16b                                     @ AES block 3 - round 11
   3266 
   3267 	aese	q2, v28.16b
   3268 	aesmc	q2, q2          @ AES block 2 - round 10
   3269 
   3270 	aese	q1, v28.16b
   3271 	aesmc	q1, q1          @ AES block 1 - round 10
   3272 
   3273 	aese	q0, v28.16b
   3274 	aesmc	q0, q0          @ AES block 0 - round 10
   3275 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   3276 
   3277 	aese	q2, v29.16b                                     @ AES block 2 - round 11
   3278 
   3279 	aese	q1, v29.16b                                     @ AES block 1 - round 11
   3280 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   3281 
   3282 	aese	q0, v29.16b                                     @ AES block 0 - round 11
   3283 	bge	.L192_dec_tail                                    @ handle tail
   3284 
   3285 	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
   3286 
   3287 	eor	q1, q5, q1                            @ AES block 1 - result
   3288 
   3289 	eor	q0, q4, q0                            @ AES block 0 - result
   3290 	rev	r9, r12                                 @ CTR block 4
   3291 	ld1	{q6, q7}, [r0], #32               @ AES block 2,3 - load ciphertext
   3292 
   3293 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
   3294 
   3295 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
   3296 
   3297 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
   3298 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   3299 	add	r12, r12, #1                            @ CTR block 4
   3300 
   3301 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
   3302 	rev64	q4, q4                                    @ GHASH block 0
   3303 
   3304 	fmov	d0, r10                               @ CTR block 4
   3305 	rev64	q5, q5                                    @ GHASH block 1
   3306 	cmp	r0, r5                   @ check if we have <= 8 blocks
   3307 
   3308 	eor	r19, r19, r13                   @ AES block 1 - round 12 low
   3309 #ifdef __ARMEB__
   3310 	rev	r19, r19
   3311 #endif
   3312 	fmov	v0.d[1], r9                               @ CTR block 4
   3313 	rev	r9, r12                                 @ CTR block 5
   3314 
   3315 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   3316 	fmov	d1, r10                               @ CTR block 5
   3317 	eor	r20, r20, r14                   @ AES block 1 - round 12 high
   3318 #ifdef __ARMEB__
   3319 	rev	r20, r20
   3320 #endif
   3321 	add	r12, r12, #1                            @ CTR block 5
   3322 	fmov	v1.d[1], r9                               @ CTR block 5
   3323 	eor	r6, r6, r13                   @ AES block 0 - round 12 low
   3324 #ifdef __ARMEB__
   3325 	rev	r6, r6
   3326 #endif
   3327 	rev	r9, r12                                 @ CTR block 6
   3328 	eor	r7, r7, r14                   @ AES block 0 - round 12 high
   3329 #ifdef __ARMEB__
   3330 	rev	r7, r7
   3331 #endif
   3332 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
   3333 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   3334 
   3335 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
   3336 
   3337 	add	r12, r12, #1                            @ CTR block 6
   3338 	eor	q2, q6, q2                            @ AES block 2 - result
   3339 	bge	.L192_dec_prepretail                              @ do prepretail
   3340 
   3341 .L192_dec_main_loop:@ main loop start
   3342 	aese	q1, v18.16b
   3343 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   3344 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   3345 
   3346 	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   3347 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   3348 
   3349 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   3350 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   3351 	rev64	q7, q7                                    @ GHASH block 4k+3
   3352 
   3353 	aese	q1, v19.16b
   3354 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   3355 	fmov	d2, r10                               @ CTR block 4k+6
   3356 
   3357 	aese	q0, v18.16b
   3358 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   3359 	eor	q4, q4, v11.16b                           @ PRE 1
   3360 
   3361 	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   3362 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   3363 
   3364 	aese	q1, v20.16b
   3365 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   3366 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   3367 
   3368 	aese	q0, v19.16b
   3369 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   3370 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   3371 
   3372 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   3373 	fmov	d3, r10                               @ CTR block 4k+7
   3374 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   3375 
   3376 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   3377 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   3378 	rev	r9, r12                                 @ CTR block 4k+7
   3379 
   3380 	aese	q2, v18.16b
   3381 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   3382 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   3383 
   3384 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   3385 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   3386 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   3387 
   3388 	aese	q1, v21.16b
   3389 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   3390 
   3391 	aese	q0, v20.16b
   3392 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   3393 	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
   3394 #ifdef __ARMEB__
   3395 	rev	r22, r22
   3396 #endif
   3397 	aese	q2, v19.16b
   3398 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   3399 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   3400 
   3401 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   3402 
   3403 	aese	q3, v18.16b
   3404 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   3405 	rev64	q6, q6                                    @ GHASH block 4k+2
   3406 
   3407 	aese	q2, v20.16b
   3408 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   3409 
   3410 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   3411 	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low
   3412 	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
   3413 #ifdef __ARMEB__
   3414 	rev	r21, r21
   3415 #endif
   3416 	aese	q1, v22.16b
   3417 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   3418 
   3419 	aese	q0, v21.16b
   3420 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   3421 
   3422 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   3423 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   3424 
   3425 	aese	q3, v19.16b
   3426 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   3427 	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high
   3428 
   3429 	aese	q0, v22.16b
   3430 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   3431 
   3432 	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   3433 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   3434 
   3435 	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   3436 
   3437 	aese	q0, v23.16b
   3438 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   3439 
   3440 	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high
   3441 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   3442 
   3443 	aese	q1, v23.16b
   3444 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   3445 
   3446 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   3447 
   3448 	aese	q3, v20.16b
   3449 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   3450 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   3451 
   3452 	aese	q1, v24.16b
   3453 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   3454 
   3455 	aese	q0, v24.16b
   3456 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   3457 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   3458 
   3459 	aese	q3, v21.16b
   3460 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   3461 
   3462 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   3463 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low
   3464 
   3465 	aese	q0, v25.16b
   3466 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   3467 
   3468 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   3469 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   3470 
   3471 	aese	q1, v25.16b
   3472 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   3473 
   3474 	aese	q0, v26.16b
   3475 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   3476 	movi	q8, #0xc2
   3477 
   3478 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   3479 
   3480 	aese	q1, v26.16b
   3481 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   3482 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   3483 
   3484 	aese	q2, v21.16b
   3485 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   3486 
   3487 	aese	q0, v27.16b
   3488 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   3489 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   3490 
   3491 	aese	q3, v22.16b
   3492 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   3493 
   3494 	aese	q2, v22.16b
   3495 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   3496 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   3497 
   3498 	aese	q0, v28.16b
   3499 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   3500 
   3501 	aese	q1, v27.16b
   3502 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   3503 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   3504 
   3505 	aese	q2, v23.16b
   3506 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   3507 
   3508 	aese	q3, v23.16b
   3509 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   3510 	shl	d8, d8, #56               @ mod_constant
   3511 
   3512 	aese	q1, v28.16b
   3513 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   3514 
   3515 	aese	q2, v24.16b
   3516 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   3517 	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext
   3518 
   3519 	aese	q3, v24.16b
   3520 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   3521 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   3522 
   3523 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   3524 	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
   3525 	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
   3526 #ifdef __ARMEB__
   3527 	rev	r23, r23
   3528 #endif
   3529 	aese	q2, v25.16b
   3530 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   3531 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   3532 
   3533 	aese	q0, v29.16b                                     @ AES block 4k+4 - round 11
   3534 	add	r12, r12, #1                            @ CTR block 4k+7
   3535 
   3536 	aese	q3, v25.16b
   3537 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   3538 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   3539 
   3540 	aese	q2, v26.16b
   3541 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   3542 	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext
   3543 
   3544 	aese	q1, v29.16b                                     @ AES block 4k+5 - round 11
   3545 	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
   3546 	rev	r9, r12                                 @ CTR block 4k+8
   3547 
   3548 	aese	q3, v26.16b
   3549 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   3550 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   3551 
   3552 	aese	q2, v27.16b
   3553 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   3554 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   3555 
   3556 	cmp	r0, r5                   @ .LOOP CONTROL
   3557 
   3558 	eor	q0, q4, q0                            @ AES block 4k+4 - result
   3559 	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
   3560 #ifdef __ARMEB__
   3561 	rev	r24, r24
   3562 #endif
   3563 	eor	q1, q5, q1                            @ AES block 4k+5 - result
   3564 
   3565 	aese	q2, v28.16b
   3566 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   3567 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   3568 
   3569 	aese	q3, v27.16b
   3570 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   3571 
   3572 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   3573 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
   3574 
   3575 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   3576 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   3577 	rev64	q5, q5                                    @ GHASH block 4k+5
   3578 
   3579 	aese	q2, v29.16b                                     @ AES block 4k+6 - round 11
   3580 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   3581 
   3582 	aese	q3, v28.16b
   3583 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   3584 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
   3585 
   3586 	fmov	d0, r10                               @ CTR block 4k+8
   3587 	add	r12, r12, #1                            @ CTR block 4k+8
   3588 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   3589 
   3590 	eor	q2, q6, q2                            @ AES block 4k+6 - result
   3591 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   3592 	rev	r9, r12                                 @ CTR block 4k+9
   3593 
   3594 	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
   3595 #ifdef __ARMEB__
   3596 	rev	r6, r6
   3597 #endif
   3598 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   3599 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   3600 
   3601 	fmov	d1, r10                               @ CTR block 4k+9
   3602 	add	r12, r12, #1                            @ CTR block 4k+9
   3603 	eor	r19, r19, r13                   @ AES block 4k+5 - round 12 low
   3604 #ifdef __ARMEB__
   3605 	rev	r19, r19
   3606 #endif
   3607 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   3608 	rev	r9, r12                                 @ CTR block 4k+10
   3609 	eor	r20, r20, r14                   @ AES block 4k+5 - round 12 high
   3610 #ifdef __ARMEB__
   3611 	rev	r20, r20
   3612 #endif
   3613 	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
   3614 #ifdef __ARMEB__
   3615 	rev	r7, r7
   3616 #endif
   3617 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
   3618 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   3619 
   3620 	add	r12, r12, #1                            @ CTR block 4k+10
   3621 	rev64	q4, q4                                    @ GHASH block 4k+4
   3622 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   3623 
   3624 	aese	q3, v29.16b                                     @ AES block 4k+7 - round 11
   3625 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
   3626 	blt	.L192_dec_main_loop
   3627 
   3628 .L192_dec_prepretail:@ PREPRETAIL
   3629 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   3630 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   3631 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   3632 
   3633 	aese	q1, v18.16b
   3634 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   3635 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   3636 
   3637 	aese	q0, v18.16b
   3638 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   3639 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   3640 
   3641 	eor	q4, q4, v11.16b                           @ PRE 1
   3642 	fmov	d2, r10                               @ CTR block 4k+6
   3643 
   3644 	aese	q1, v19.16b
   3645 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   3646 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   3647 
   3648 	aese	q0, v19.16b
   3649 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   3650 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   3651 
   3652 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   3653 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   3654 	fmov	d3, r10                               @ CTR block 4k+7
   3655 
   3656 	aese	q1, v20.16b
   3657 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   3658 	rev64	q6, q6                                    @ GHASH block 4k+2
   3659 
   3660 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   3661 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   3662 	rev	r9, r12                                 @ CTR block 4k+7
   3663 
   3664 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   3665 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   3666 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   3667 
   3668 	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   3669 	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
   3670 #ifdef __ARMEB__
   3671 	rev	r24, r24
   3672 #endif
   3673 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   3674 
   3675 	aese	q0, v20.16b
   3676 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   3677 	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
   3678 #ifdef __ARMEB__
   3679 	rev	r21, r21
   3680 #endif
   3681 	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   3682 	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
   3683 #ifdef __ARMEB__
   3684 	rev	r22, r22
   3685 #endif
   3686 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   3687 
   3688 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   3689 	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
   3690 #ifdef __ARMEB__
   3691 	rev	r23, r23
   3692 #endif
   3693 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   3694 
   3695 	rev64	q7, q7                                    @ GHASH block 4k+3
   3696 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   3697 
   3698 	aese	q3, v18.16b
   3699 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   3700 	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high
   3701 
   3702 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   3703 	add	r12, r12, #1                            @ CTR block 4k+7
   3704 
   3705 	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   3706 	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low
   3707 
   3708 	aese	q2, v18.16b
   3709 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   3710 
   3711 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   3712 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   3713 
   3714 	aese	q3, v19.16b
   3715 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   3716 
   3717 	aese	q2, v19.16b
   3718 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   3719 	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high
   3720 
   3721 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   3722 
   3723 	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   3724 
   3725 	aese	q2, v20.16b
   3726 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   3727 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   3728 
   3729 	aese	q3, v20.16b
   3730 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   3731 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   3732 
   3733 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   3734 
   3735 	aese	q0, v21.16b
   3736 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   3737 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   3738 
   3739 	aese	q1, v21.16b
   3740 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   3741 
   3742 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   3743 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low
   3744 
   3745 	aese	q0, v22.16b
   3746 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   3747 
   3748 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   3749 	movi	q8, #0xc2
   3750 
   3751 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   3752 
   3753 	aese	q2, v21.16b
   3754 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   3755 
   3756 	shl	d8, d8, #56               @ mod_constant
   3757 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   3758 
   3759 	aese	q0, v23.16b
   3760 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   3761 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   3762 
   3763 	aese	q2, v22.16b
   3764 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   3765 
   3766 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   3767 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   3768 
   3769 	aese	q0, v24.16b
   3770 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   3771 
   3772 	aese	q3, v21.16b
   3773 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   3774 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   3775 
   3776 	aese	q2, v23.16b
   3777 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   3778 
   3779 	aese	q0, v25.16b
   3780 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   3781 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   3782 
   3783 	aese	q3, v22.16b
   3784 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   3785 
   3786 	aese	q2, v24.16b
   3787 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   3788 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   3789 
   3790 	aese	q0, v26.16b
   3791 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   3792 
   3793 	aese	q3, v23.16b
   3794 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   3795 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   3796 
   3797 	aese	q1, v22.16b
   3798 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   3799 
   3800 	aese	q2, v25.16b
   3801 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   3802 
   3803 	aese	q0, v27.16b
   3804 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   3805 
   3806 	aese	q1, v23.16b
   3807 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   3808 
   3809 	aese	q3, v24.16b
   3810 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   3811 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   3812 
   3813 	aese	q0, v28.16b
   3814 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   3815 
   3816 	aese	q1, v24.16b
   3817 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   3818 
   3819 	aese	q3, v25.16b
   3820 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   3821 
   3822 	aese	q2, v26.16b
   3823 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   3824 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   3825 
   3826 	aese	q1, v25.16b
   3827 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   3828 
   3829 	aese	q3, v26.16b
   3830 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   3831 
   3832 	aese	q2, v27.16b
   3833 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   3834 
   3835 	aese	q1, v26.16b
   3836 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   3837 
   3838 	aese	q3, v27.16b
   3839 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   3840 
   3841 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   3842 
   3843 	aese	q1, v27.16b
   3844 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   3845 
   3846 	aese	q2, v28.16b
   3847 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   3848 
   3849 	aese	q3, v28.16b
   3850 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   3851 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   3852 
   3853 	aese	q1, v28.16b
   3854 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   3855 
   3856 	aese	q0, v29.16b
   3857 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   3858 
   3859 	aese	q2, v29.16b
   3860 
   3861 	aese	q1, v29.16b
   3862 
   3863 	aese	q3, v29.16b
   3864 
   3865 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   3866 .L192_dec_tail:@ TAIL
        @ Tail entry for the 192-bit decrypt kernel.  Decrypts the next 16-byte
        @ ciphertext block into r6:r7 (low:high) and then dispatches on the number
        @ of bytes still to process (r5) to one of the .L192_dec_blocks_* stages.
        @ On the fall-through path q0-q3 have just been through the last AESE with
        @ v29 (no AESMC), so only the XOR with the final round key, kept in the
        @ general registers r13 (low) / r14 (high), is still outstanding.
        @ NOTE(review): r4 is presumably end_input_ptr as set up in the prologue,
        @ which is outside this section - confirm.
   3867 
   3868 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
   3869 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   3870 
   3871 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   3872 
   3873 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   3874 
   3875 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   3876 
        @ q8 = running GHASH value from v11 with its 64-bit halves swapped; it is
        @ XORed into the first block hashed by the tail and then cleared.
   3877 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   3878 
        @ The flags set here are consumed by the bgt below; the eor/rev
        @ instructions in between do not write the flags.
   3879 	cmp	r5, #48
   3880 
   3881 	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
   3882 #ifdef __ARMEB__
   3883 	rev	r7, r7
   3884 #endif
   3885 	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
   3886 #ifdef __ARMEB__
   3887 	rev	r6, r6
   3888 #endif
   3889 	bgt	.L192_dec_blocks_more_than_3
   3890 
        @ 48 bytes or fewer remain.  The skipped stage(s) would have initialised
        @ the GHASH accumulators, so zero them here instead: v11 = low, q9 = high,
        @ v10 = mid (below).
   3891 	movi	v11.8b, #0
   3892 	movi	q9, #0
   3893 
        @ Later stages use fixed keystream registers (q2 in blocks_more_than_2,
        @ q3 in blocks_more_than_1), so shift the unused keystream blocks up by
        @ one and step the counter in r12 back by one for the block that will
        @ not be consumed.
   3894 	mov	q3, q2
   3895 	mov	q2, q1
   3896 	sub	r12, r12, #1
   3897 
   3898 	movi	v10.8b, #0
   3899 	cmp	r5, #32
   3900 	bgt	.L192_dec_blocks_more_than_2
   3901 
        @ 32 bytes or fewer remain: the final block must use the keystream that
        @ was originally in q1; give back one more counter value.
   3902 	mov	q3, q1
   3903 	cmp	r5, #16
   3904 	sub	r12, r12, #1
   3905 
   3906 	bgt	.L192_dec_blocks_more_than_1
   3907 
        @ 16 bytes or fewer remain: the block already decrypted above is the last
        @ one, so a third counter value is given back.
   3908 	sub	r12, r12, #1
   3909 	b	.L192_dec_blocks_less_than_1
   3910 .L192_dec_blocks_more_than_3:@ blocks left >  3
        @ Reached only by branch from .L192_dec_tail when more than 48 bytes remain.
        @ Stores the plaintext of the block decrypted by the previous stage
        @ (r6:r7), starts the GHASH accumulators with that block's ciphertext
        @ (v11 = low, v9 = high, v10 = mid, all written rather than accumulated),
        @ and decrypts the next block with keystream q1.  Falls through into
        @ .L192_dec_blocks_more_than_2.
        @ NOTE(review): v15 / v17.d[1] are presumably the H^4 key and its
        @ Karatsuba "k" value, loaded outside this section - confirm.
   3911 	rev64	q4, q5                                    @ GHASH final-3 block
   3912 	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext
   3913 
   3914 	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result
   3915 
   3916 	eor	q4, q4, q8                           @ feed in partial tag
   3917 
   3918 	eor	q0, q5, q1                            @ AES final-2 block - result
   3919 
   3920 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
   3921 	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
   3922 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
   3923 
   3924 	mov	r7, v0.d[1]                            @ AES final-2 block - mov high
   3925 
        @ Karatsuba middle term: (high half ^ low half of the block) times the
        @ value taken from v17.d[1].
   3926 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
   3927 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
   3928 
   3929 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
   3930 
   3931 	eor	r6, r6, r13                   @ AES final-2 block - round 12 low
   3932 #ifdef __ARMEB__
   3933 	rev	r6, r6
   3934 #endif
   3935 	movi	q8, #0                                        @ suppress further partial tag feed in
   3936 
   3937 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
   3938 	eor	r7, r7, r14                   @ AES final-2 block - round 12 high
   3939 #ifdef __ARMEB__
   3940 	rev	r7, r7
   3941 #endif
   3942 .L192_dec_blocks_more_than_2:@ blocks left >  2
        @ Reached by fall-through from .L192_dec_blocks_more_than_3 or by branch
        @ from .L192_dec_tail (more than 32 bytes remain).  Stores the previous
        @ block's plaintext (r6:r7), folds that block's ciphertext into the GHASH
        @ accumulators (v11 low, q9 high, v10 mid) using v14 / v17.d[0], and
        @ decrypts the next block with keystream q2.  q8 carries the running tag
        @ only when this is the first stage executed; otherwise it is already zero.
        @ NOTE(review): v14 is presumably the H^3 key loaded outside this section
        @ - confirm.
   3943 
   3944 	rev64	q4, q5                                    @ GHASH final-2 block
   3945 	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext
   3946 
   3947 	eor	q4, q4, q8                           @ feed in partial tag
   3948 
   3949 	movi	q8, #0                                        @ suppress further partial tag feed in
   3950 
   3951 	eor	q0, q5, q2                            @ AES final-1 block - result
   3952 
   3953 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
   3954 
   3955 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
   3956 
   3957 	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result
   3958 
   3959 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
   3960 	mov	r7, v0.d[1]                            @ AES final-1 block - mov high
   3961 
   3962 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
   3963 	mov	r6, v0.d[0]                            @ AES final-1 block - mov low
   3964 
   3965 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
   3966 
   3967 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
   3968 
   3969 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
   3970 	eor	r7, r7, r14                   @ AES final-1 block - round 12 high
   3971 #ifdef __ARMEB__
   3972 	rev	r7, r7
   3973 #endif
   3974 	eor	r6, r6, r13                   @ AES final-1 block - round 12 low
   3975 #ifdef __ARMEB__
   3976 	rev	r6, r6
   3977 #endif
   3978 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
   3979 .L192_dec_blocks_more_than_1:@ blocks left >  1
        @ Reached by fall-through from .L192_dec_blocks_more_than_2 or by branch
        @ from .L192_dec_tail (more than 16 bytes remain).  Stores the previous
        @ block's plaintext (r6:r7), folds that block's ciphertext into the GHASH
        @ accumulators using v13 / v16.d[1], and decrypts the final (possibly
        @ partial) block with keystream q3.  The final block's result is left in
        @ r6:r7 for .L192_dec_blocks_less_than_1 to mask and store.
        @ NOTE(review): v13 is presumably the H^2 key loaded outside this section
        @ - confirm.
   3980 
   3981 	rev64	q4, q5                                    @ GHASH final-1 block
   3982 
   3983 	eor	q4, q4, q8                           @ feed in partial tag
   3984 	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext
   3985 
   3986 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
   3987 
   3988 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
   3989 
   3990 	eor	q0, q5, q3                            @ AES final block - result
   3991 	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result
   3992 
   3993 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
   3994 
   3995 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
   3996 
   3997 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
   3998 	mov	r7, v0.d[1]                            @ AES final block - mov high
   3999 
        @ Copy the mid term into the upper 64 bits so that pmull2 below can pair
        @ it with the upper half of v16.
   4000 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
   4001 	mov	r6, v0.d[0]                            @ AES final block - mov low
   4002 
   4003 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
   4004 
   4005 	movi	q8, #0                                        @ suppress further partial tag feed in
   4006 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
   4007 	eor	r7, r7, r14                   @ AES final block - round 12 high
   4008 #ifdef __ARMEB__
   4009 	rev	r7, r7
   4010 #endif
   4011 	eor	r6, r6, r13                   @ AES final block - round 12 low
   4012 #ifdef __ARMEB__
   4013 	rev	r6, r6
   4014 #endif
   4015 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
   4016 .L192_dec_blocks_less_than_1:@ blocks left <= 1
        @ Final stage of the decrypt tail.  On entry r6:r7 hold the decrypted last
        @ block and q5 its ciphertext.  This stage:
        @   1. builds a 128-bit mask (r9 = low 64 bits, r10 = high 64 bits) that
        @      covers only the bits of the last block that belong to the message;
        @   2. merges the masked plaintext with the bytes already present at [r2]
        @      so that the 16-byte store does not alter bytes past the message;
        @   3. writes the updated counter word to [r16, #12];
        @   4. hashes the masked ciphertext, reduces the three GHASH accumulators
        @      (v11 low, q9 high, v10 mid) and stores the result to [r3];
        @   5. restores the saved registers and returns r15 in r0.
        @ r13/r14 are reused as all-ones scratch values here; the round key they
        @ held is no longer needed.
   4017 
   4018 	mvn	r13, xzr                                      @ rk12_l = 0xffffffffffffffff
   4019 	ldp	r4, r5, [r2]  @ load existing bytes we need to not overwrite
        @ The next four instructions compute r1 = (128 - (bit_length mod 128))
        @ mod 128, i.e. the number of unused bits in the last block (0 when the
        @ message ends on a 16-byte boundary).
   4020 	and	r1, r1, #127                    @ bit_length %= 128
   4021 
   4022 	sub	r1, r1, #128                    @ bit_length -= 128
   4023 
   4024 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
   4025 
   4026 	and	r1, r1, #127                    @ bit_length %= 128
   4027 	mvn	r14, xzr                                      @ rk12_h = 0xffffffffffffffff
   4028 
        @ NOTE(review): for r1 >= 64 this looks like it relies on the register
        @ shift amount being taken modulo 64 (as the AArch64 LSR register form
        @ does) - confirm for the target this file is assembled for.
   4029 	lsr	r14, r14, r1                     @ rk12_h is mask for top 64b of last block
   4030 	cmp	r1, #64
   4031 
        @ Fewer than 64 unused bits: low mask is all ones, high mask is the
        @ shifted value.  Otherwise: low mask is the shifted value, high mask is 0.
   4032 	csel	r9, r13, r14, lt
   4033 	csel	r10, r14, xzr, lt
   4034 
   4035 	fmov	d0, r9                                   @ ctr0b is mask for last block
   4036 	and	r6, r6, r9
   4037 	bic	r4, r4, r9           @ mask out low existing bytes
   4038 
   4039 	orr	r6, r6, r4
   4040 	mov	v0.d[1], r10
        @ r9 is free again (the mask now lives in v0): reuse it for the counter.
   4041 #ifndef __ARMEB__
   4042 	rev	r9, r12
   4043 #else
   4044 	mov	r9, r12
   4045 #endif
   4046 
   4047 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   4048 	str	r9, [r16, #12]                          @ store the updated counter
   4049 
   4050 	rev64	q4, q5                                    @ GHASH final block
   4051 
        @ q8 still holds the running tag only when the whole message fitted in
        @ this single block; every earlier tail stage clears it.
   4052 	eor	q4, q4, q8                           @ feed in partial tag
   4053 	bic	r5, r5, r10 @ mask out high existing bytes
   4054 
   4055 	and	r7, r7, r10
   4056 
   4057 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   4058 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   4059 
   4060 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   4061 
   4062 	eor	q8, q8, q4                          @ GHASH final block - mid
   4063 
   4064 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   4065 
   4066 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   4067 
   4068 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   4069 
   4070 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
        @ Reduction: q8 becomes the constant 0xc2 << 56 (movi then shl below),
        @ which is multiplied first with the high part and then with the mid part.
   4071 	movi	q8, #0xc2
   4072 
   4073 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   4074 
   4075 	shl	d8, d8, #56               @ mod_constant
   4076 
   4077 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   4078 
   4079 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
        @ Finish the merge of the high half and write the last plaintext block.
        @ No post-increment: r2 is not advanced past the final block.
   4080 	orr	r7, r7, r5
   4081 	stp	r6, r7, [r2]
   4082 
   4083 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   4084 
   4085 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   4086 
   4087 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   4088 
   4089 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   4090 
   4091 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   4092 
   4093 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   4094 
   4095 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
        @ Swap halves and byte-reverse v11 before storing it to [r3].
        @ NOTE(review): r15 presumably holds byte_len saved by the prologue (cf.
        @ "mov r15, r5" in aes_gcm_enc_256_kernel); this function's prologue is
        @ outside this section - confirm.
   4096 	ext	v11.16b, v11.16b, v11.16b, #8
   4097 	rev64	v11.16b, v11.16b
   4098 	mov	r0, r15
   4099 	st1	{ v11.16b }, [r3]
   4100 
        @ Epilogue: reload r19-r24 and d8-d15 from the stack and release the
        @ 112-byte frame with the final post-indexed ldp.
   4101 	ldp	r21, r22, [sp, #16]
   4102 	ldp	r23, r24, [sp, #32]
   4103 	ldp	d8, d9, [sp, #48]
   4104 	ldp	d10, d11, [sp, #64]
   4105 	ldp	d12, d13, [sp, #80]
   4106 	ldp	d14, d15, [sp, #96]
   4107 	ldp	r19, r20, [sp], #112
   4108 	RET
   4109 
   4110 .L192_dec_ret:
        @ Alternate exit: returns 0 in r0 without restoring any registers or
        @ adjusting sp.
        @ NOTE(review): presumably the target of a zero-length check at function
        @ entry, before the frame is pushed (cf. "cbz r1, .L256_enc_ret" in
        @ aes_gcm_enc_256_kernel); the branch is outside this section - confirm.
   4111 	mov	r0, #0x0
   4112 	RET
   4113 .size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
   4114 .globl	aes_gcm_enc_256_kernel
   4115 .type	aes_gcm_enc_256_kernel,%function
   4116 .align	4
   4117 aes_gcm_enc_256_kernel:
        @ AES-256-GCM encryption kernel: entry, key/hash-key setup and the first
        @ four blocks (the main loop and tail follow this section).
        @
        @ Arguments as used by the code below:
        @   r0  input pointer (plaintext is loaded from [r0])
        @   r1  length in bits (byte_len = r1 >> 3); zero branches to .L256_enc_ret
        @   r2  output pointer (ciphertext is stored to [r2])
        @   r3  GHASH data: current tag at [r3], hash keys at offsets 32 (h1),
        @       64 (h2), 80 (h3) and 112 (h4)
        @   r4  pointer to the 16-byte counter block (copied to r16)
        @   r5  pointer to the round keys (copied to r8); rk0..rk13 are read
        @       sequentially, rk14 from offset 224
        @
        @ Register roles established here:
        @   r4        end_input_ptr           r5       main_end_input_ptr
        @   r10, r11  the two 64-bit halves of the counter block
        @   r12       byte-reversed 32-bit counter, incremented once per block
        @   r13, r14  rk14 low / high, applied with general-register XORs
        @   r15       byte_len
        @   v18-v31   rk0-rk13
        @   v12-v15   h1-h4;  v16 = h2k | h1k;  v17 = h4k | h3k
        @   v11       current tag, halves swapped and byte-reversed
        @   v0-v3     AES state for four counter blocks
   4118 	cbz	r1, .L256_enc_ret
        @ Push a 112-byte frame holding r19-r24 and d8-d15.
   4119 	stp	r19, r20, [sp, #-112]!
   4120 	mov	r16, r4
   4121 	mov	r8, r5
   4122 	stp	r21, r22, [sp, #16]
   4123 	stp	r23, r24, [sp, #32]
   4124 	stp	d8, d9, [sp, #48]
   4125 	stp	d10, d11, [sp, #64]
   4126 	stp	d12, d13, [sp, #80]
   4127 	stp	d14, d15, [sp, #96]
   4128 
   4129 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   4130 	lsr	r5, r1, #3              @ byte_len
   4131 	mov	r15, r5
   4132 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   4133 #ifdef __ARMEB__
   4134 	rev	r10, r10
   4135 	rev	r11, r11
   4136 #endif
   4137 	ldp	r13, r14, [r8, #224]                     @ load rk14
   4138 #ifdef __ARMEB__
   4139 	ror	r13, r13, #32
   4140 	ror	r14, r14, #32
   4141 #endif
   4142 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   4143 	sub	r5, r5, #1      @ byte_len - 1
   4144 
   4145 	ld1	{v18.4s}, [r8], #16                               @ load rk0
        @ (byte_len - 1) rounded down to a multiple of 64; adding r0 below turns
        @ it into main_end_input_ptr.
   4146 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   4147 
   4148 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   4149 	add	r5, r5, r0
   4150 
   4151 	lsr	r12, r11, #32
   4152 	fmov	d2, r10                               @ CTR block 2
        @ NOTE(review): as written this orr leaves r11 unchanged; with 32-bit
        @ register operands on AArch64 it would clear the upper 32 bits, which is
        @ what the later "orr r9, r11, r9, lsl #32" appears to expect - confirm
        @ against the generator output.
   4153 	orr	r11, r11, r11
   4154 
   4155 	rev	r12, r12                                @ rev_ctr32
        @ The flags from this compare are consumed by "bge .L256_enc_tail" at the
        @ end of the key setup; no instruction in between writes the flags.
   4156 	cmp	r0, r5                   @ check if we have <= 4 blocks
   4157 	fmov	d1, r10                               @ CTR block 1
   4158 
   4159 	aese	q0, v18.16b
   4160 	aesmc	q0, q0          @ AES block 0 - round 0
   4161 	add	r12, r12, #1                            @ increment rev_ctr32
   4162 
   4163 	rev	r9, r12                                 @ CTR block 1
   4164 	fmov	d3, r10                               @ CTR block 3
   4165 
   4166 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   4167 	add	r12, r12, #1                            @ CTR block 1
   4168 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   4169 
   4170 	fmov	v1.d[1], r9                               @ CTR block 1
   4171 	rev	r9, r12                                 @ CTR block 2
   4172 	add	r12, r12, #1                            @ CTR block 2
   4173 
   4174 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   4175 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   4176 
   4177 	fmov	v2.d[1], r9                               @ CTR block 2
   4178 	rev	r9, r12                                 @ CTR block 3
   4179 
   4180 	aese	q0, v19.16b
   4181 	aesmc	q0, q0          @ AES block 0 - round 1
   4182 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   4183 
   4184 	fmov	v3.d[1], r9                               @ CTR block 3
   4185 
   4186 	aese	q1, v18.16b
   4187 	aesmc	q1, q1          @ AES block 1 - round 0
   4188 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   4189 
   4190 	aese	q0, v20.16b
   4191 	aesmc	q0, q0          @ AES block 0 - round 2
   4192 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   4193 
   4194 	aese	q2, v18.16b
   4195 	aesmc	q2, q2          @ AES block 2 - round 0
   4196 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   4197 
   4198 	aese	q1, v19.16b
   4199 	aesmc	q1, q1          @ AES block 1 - round 1
   4200 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   4201 #ifndef __ARMEB__
   4202 	ext	v14.16b, v14.16b, v14.16b, #8
   4203 #endif
   4204 	aese	q3, v18.16b
   4205 	aesmc	q3, q3          @ AES block 3 - round 0
   4206 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   4207 
   4208 	aese	q2, v19.16b
   4209 	aesmc	q2, q2          @ AES block 2 - round 1
   4210 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   4211 
   4212 	aese	q1, v20.16b
   4213 	aesmc	q1, q1          @ AES block 1 - round 2
   4214 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   4215 #ifndef __ARMEB__
   4216 	ext	v13.16b, v13.16b, v13.16b, #8
   4217 #endif
   4218 	aese	q3, v19.16b
   4219 	aesmc	q3, q3          @ AES block 3 - round 1
   4220 	ld1	{v27.4s}, [r8], #16                               @ load rk9
   4221 
   4222 	aese	q2, v20.16b
   4223 	aesmc	q2, q2          @ AES block 2 - round 2
   4224 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   4225 #ifndef __ARMEB__
   4226 	ext	v15.16b, v15.16b, v15.16b, #8
   4227 #endif
   4228 	aese	q1, v21.16b
   4229 	aesmc	q1, q1          @ AES block 1 - round 3
   4230 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   4231 
   4232 	aese	q3, v20.16b
   4233 	aesmc	q3, q3          @ AES block 3 - round 2
   4234 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   4235 
   4236 	aese	q2, v21.16b
   4237 	aesmc	q2, q2          @ AES block 2 - round 3
   4238 	add	r12, r12, #1                            @ CTR block 3
   4239 
   4240 	aese	q0, v21.16b
   4241 	aesmc	q0, q0          @ AES block 0 - round 3
   4242 
   4243 	aese	q3, v21.16b
   4244 	aesmc	q3, q3          @ AES block 3 - round 3
        @ Load the current tag from [r3], swap its 64-bit halves and byte-reverse
        @ each half into the form used by the GHASH arithmetic.
   4245 	ld1	{ v11.16b}, [r3]
   4246 	ext	v11.16b, v11.16b, v11.16b, #8
   4247 	rev64	v11.16b, v11.16b
   4248 
   4249 	aese	q2, v22.16b
   4250 	aesmc	q2, q2          @ AES block 2 - round 4
   4251 
   4252 	aese	q0, v22.16b
   4253 	aesmc	q0, q0          @ AES block 0 - round 4
   4254 
   4255 	aese	q1, v22.16b
   4256 	aesmc	q1, q1          @ AES block 1 - round 4
   4257 
   4258 	aese	q3, v22.16b
   4259 	aesmc	q3, q3          @ AES block 3 - round 4
   4260 
   4261 	aese	q0, v23.16b
   4262 	aesmc	q0, q0          @ AES block 0 - round 5
   4263 
   4264 	aese	q1, v23.16b
   4265 	aesmc	q1, q1          @ AES block 1 - round 5
   4266 
   4267 	aese	q3, v23.16b
   4268 	aesmc	q3, q3          @ AES block 3 - round 5
   4269 
   4270 	aese	q2, v23.16b
   4271 	aesmc	q2, q2          @ AES block 2 - round 5
   4272 
   4273 	aese	q1, v24.16b
   4274 	aesmc	q1, q1          @ AES block 1 - round 6
   4275 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   4276 
   4277 	aese	q3, v24.16b
   4278 	aesmc	q3, q3          @ AES block 3 - round 6
   4279 	ld1	{v30.4s}, [r8], #16                              @ load rk12
   4280 
   4281 	aese	q0, v24.16b
   4282 	aesmc	q0, q0          @ AES block 0 - round 6
   4283 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   4284 #ifndef __ARMEB__
   4285 	ext	v12.16b, v12.16b, v12.16b, #8
   4286 #endif
   4287 	aese	q2, v24.16b
   4288 	aesmc	q2, q2          @ AES block 2 - round 6
   4289 	ld1	{v31.4s}, [r8], #16                              @ load rk13
   4290 
   4291 	aese	q1, v25.16b
   4292 	aesmc	q1, q1          @ AES block 1 - round 7
   4293 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   4294 
   4295 	aese	q0, v25.16b
   4296 	aesmc	q0, q0          @ AES block 0 - round 7
   4297 
   4298 	aese	q2, v25.16b
   4299 	aesmc	q2, q2          @ AES block 2 - round 7
   4300 
   4301 	aese	q3, v25.16b
   4302 	aesmc	q3, q3          @ AES block 3 - round 7
   4303 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   4304 
   4305 	aese	q1, v26.16b
   4306 	aesmc	q1, q1          @ AES block 1 - round 8
   4307 
   4308 	aese	q2, v26.16b
   4309 	aesmc	q2, q2          @ AES block 2 - round 8
   4310 
   4311 	aese	q3, v26.16b
   4312 	aesmc	q3, q3          @ AES block 3 - round 8
   4313 
   4314 	aese	q1, v27.16b
   4315 	aesmc	q1, q1          @ AES block 1 - round 9
   4316 
   4317 	aese	q2, v27.16b
   4318 	aesmc	q2, q2          @ AES block 2 - round 9
   4319 
   4320 	aese	q0, v26.16b
   4321 	aesmc	q0, q0          @ AES block 0 - round 8
   4322 
   4323 	aese	q1, v28.16b
   4324 	aesmc	q1, q1          @ AES block 1 - round 10
   4325 
   4326 	aese	q3, v27.16b
   4327 	aesmc	q3, q3          @ AES block 3 - round 9
   4328 
   4329 	aese	q0, v27.16b
   4330 	aesmc	q0, q0          @ AES block 0 - round 9
   4331 
   4332 	aese	q2, v28.16b
   4333 	aesmc	q2, q2          @ AES block 2 - round 10
   4334 
   4335 	aese	q3, v28.16b
   4336 	aesmc	q3, q3          @ AES block 3 - round 10
   4337 
   4338 	aese	q1, v29.16b
   4339 	aesmc	q1, q1          @ AES block 1 - round 11
   4340 
   4341 	aese	q2, v29.16b
   4342 	aesmc	q2, q2          @ AES block 2 - round 11
   4343 
   4344 	aese	q0, v28.16b
   4345 	aesmc	q0, q0          @ AES block 0 - round 10
   4346 
   4347 	aese	q1, v30.16b
   4348 	aesmc	q1, q1          @ AES block 1 - round 12
   4349 
   4350 	aese	q2, v30.16b
   4351 	aesmc	q2, q2          @ AES block 2 - round 12
   4352 
   4353 	aese	q0, v29.16b
   4354 	aesmc	q0, q0          @ AES block 0 - round 11
   4355 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   4356 
   4357 	aese	q3, v29.16b
   4358 	aesmc	q3, q3          @ AES block 3 - round 11
   4359 
        @ Round 13 is an aese without aesmc.  The remaining XOR with rk14 is not
        @ done on the vector state: it is applied to the plaintext in general
        @ registers (r13/r14) before that is XORed with the state.
   4360 	aese	q2, v31.16b                                     @ AES block 2 - round 13
   4361 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   4362 
   4363 	aese	q0, v30.16b
   4364 	aesmc	q0, q0          @ AES block 0 - round 12
   4365 
   4366 	aese	q3, v30.16b
   4367 	aesmc	q3, q3          @ AES block 3 - round 12
   4368 
   4369 	aese	q1, v31.16b                                     @ AES block 1 - round 13
   4370 
   4371 	aese	q0, v31.16b                                     @ AES block 0 - round 13
   4372 
   4373 	aese	q3, v31.16b                                     @ AES block 3 - round 13
   4374 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
        @ Uses the flags from "cmp r0, r5" near the top of the function.
   4375 	bge	.L256_enc_tail                                    @ handle tail
   4376 
        @ First four blocks: load 64 bytes of plaintext into general registers,
        @ XOR each half with rk14 (r13 low / r14 high), move the result into
        @ v4-v7, XOR with the AES states v0-v3 and store the ciphertext to [r2].
        @ Interleaved with that, v0-v2 are reloaded with counter blocks 4-6 and r9
        @ is left holding the counter half of block 7 for the main loop.
   4377 	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
   4378 #ifdef __ARMEB__
   4379 	rev	r19, r19
   4380 	rev	r20, r20
   4381 #endif
   4382 	rev	r9, r12                                 @ CTR block 4
   4383 	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
   4384 #ifdef __ARMEB__
   4385 	rev	r6, r6
   4386 	rev	r7, r7
   4387 #endif
   4388 	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
   4389 #ifdef __ARMEB__
   4390 	rev	r23, r23
   4391 	rev	r24, r24
   4392 #endif
   4393 	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
   4394 #ifdef __ARMEB__
   4395 	rev	r21, r21
   4396 	rev	r22, r22
   4397 #endif
   4398 	add	r0, r0, #64                       @ AES input_ptr update
   4399 
   4400 	eor	r19, r19, r13                     @ AES block 1 - round 14 low
   4401 	eor	r20, r20, r14                     @ AES block 1 - round 14 high
   4402 
   4403 	fmov	d5, r19                               @ AES block 1 - mov low
   4404 	eor	r6, r6, r13                     @ AES block 0 - round 14 low
   4405 
   4406 	eor	r7, r7, r14                     @ AES block 0 - round 14 high
   4407 	eor	r24, r24, r14                     @ AES block 3 - round 14 high
   4408 	fmov	d4, r6                               @ AES block 0 - mov low
   4409 
        @ The flags from this compare are consumed by "bge .L256_enc_prepretail"
        @ at the end of this section; no instruction in between writes the flags.
   4410 	cmp	r0, r5                   @ check if we have <= 8 blocks
   4411 	fmov	v4.d[1], r7                           @ AES block 0 - mov high
   4412 	eor	r23, r23, r13                     @ AES block 3 - round 14 low
   4413 
   4414 	eor	r21, r21, r13                     @ AES block 2 - round 14 low
   4415 	fmov	v5.d[1], r20                           @ AES block 1 - mov high
   4416 
   4417 	fmov	d6, r21                               @ AES block 2 - mov low
   4418 	add	r12, r12, #1                            @ CTR block 4
   4419 
   4420 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   4421 	fmov	d7, r23                               @ AES block 3 - mov low
   4422 	eor	r22, r22, r14                     @ AES block 2 - round 14 high
   4423 
   4424 	fmov	v6.d[1], r22                           @ AES block 2 - mov high
   4425 
   4426 	eor	q4, q4, q0                          @ AES block 0 - result
   4427 	fmov	d0, r10                               @ CTR block 4
   4428 
   4429 	fmov	v0.d[1], r9                               @ CTR block 4
   4430 	rev	r9, r12                                 @ CTR block 5
   4431 	add	r12, r12, #1                            @ CTR block 5
   4432 
   4433 	eor	q5, q5, q1                          @ AES block 1 - result
   4434 	fmov	d1, r10                               @ CTR block 5
   4435 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   4436 
   4437 	fmov	v1.d[1], r9                               @ CTR block 5
   4438 	rev	r9, r12                                 @ CTR block 6
   4439 	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
   4440 
   4441 	fmov	v7.d[1], r24                           @ AES block 3 - mov high
   4442 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   4443 	eor	q6, q6, q2                          @ AES block 2 - result
   4444 
   4445 	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
   4446 
   4447 	add	r12, r12, #1                            @ CTR block 6
   4448 	fmov	d2, r10                               @ CTR block 6
   4449 
   4450 	fmov	v2.d[1], r9                               @ CTR block 6
   4451 	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
   4452 	rev	r9, r12                                 @ CTR block 7
   4453 
   4454 	orr	r9, r11, r9, lsl #32            @ CTR block 7
   4455 
   4456 	eor	q7, q7, q3                          @ AES block 3 - result
   4457 	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
   4458 	bge	.L256_enc_prepretail                               @ do prepretail
   4459 
   4460 .L256_enc_main_loop:@ main loop start
   4461 	aese	q0, v18.16b
   4462 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   4463 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
   4464 
   4465 	aese	q1, v18.16b
   4466 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   4467 	fmov	d3, r10                               @ CTR block 4k+3
   4468 
   4469 	aese	q2, v18.16b
   4470 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   4471 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   4472 
   4473 	aese	q0, v19.16b
   4474 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   4475 	fmov	v3.d[1], r9                               @ CTR block 4k+3
   4476 
   4477 	aese	q1, v19.16b
   4478 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   4479 	ldp	r23, r24, [r0, #48]           @ AES block 4k+7 - load plaintext
   4480 #ifdef __ARMEB__
   4481 	rev	r23, r23
   4482 	rev	r24, r24
   4483 #endif
   4484 	aese	q2, v19.16b
   4485 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   4486 	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
   4487 #ifdef __ARMEB__
   4488 	rev	r21, r21
   4489 	rev	r22, r22
   4490 #endif
   4491 	aese	q0, v20.16b
   4492 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   4493 	eor	q4, q4, v11.16b                           @ PRE 1
   4494 
   4495 	aese	q1, v20.16b
   4496 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   4497 
   4498 	aese	q3, v18.16b
   4499 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   4500 	eor	r23, r23, r13                     @ AES block 4k+7 - round 14 low
   4501 
   4502 	aese	q0, v21.16b
   4503 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   4504 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   4505 
   4506 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   4507 	eor	r22, r22, r14                     @ AES block 4k+6 - round 14 high
   4508 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   4509 
   4510 	aese	q3, v19.16b
   4511 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   4512 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
   4513 
   4514 	aese	q0, v22.16b
   4515 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   4516 
   4517 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   4518 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   4519 
   4520 	aese	q2, v20.16b
   4521 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   4522 
   4523 	aese	q0, v23.16b
   4524 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   4525 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4526 
   4527 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   4528 
   4529 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   4530 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
   4531 
   4532 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   4533 
   4534 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   4535 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   4536 
   4537 	aese	q1, v21.16b
   4538 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   4539 
   4540 	aese	q3, v20.16b
   4541 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   4542 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   4543 
   4544 	aese	q2, v21.16b
   4545 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   4546 
   4547 	aese	q1, v22.16b
   4548 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   4549 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   4550 
   4551 	aese	q3, v21.16b
   4552 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   4553 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   4554 
   4555 	aese	q2, v22.16b
   4556 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   4557 
   4558 	aese	q0, v24.16b
   4559 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   4560 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   4561 
   4562 	aese	q3, v22.16b
   4563 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   4564 
   4565 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   4566 
   4567 	aese	q0, v25.16b
   4568 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   4569 
   4570 	aese	q3, v23.16b
   4571 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   4572 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   4573 
   4574 	aese	q1, v23.16b
   4575 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   4576 
   4577 	aese	q0, v26.16b
   4578 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   4579 
   4580 	aese	q2, v23.16b
   4581 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   4582 
   4583 	aese	q1, v24.16b
   4584 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   4585 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   4586 
   4587 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   4588 
   4589 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   4590 
   4591 	aese	q1, v25.16b
   4592 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   4593 
   4594 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   4595 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   4596 
   4597 	aese	q3, v24.16b
   4598 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   4599 	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
   4600 #ifdef __ARMEB__
   4601 	rev	r19, r19
   4602 	rev	r20, r20
   4603 #endif
   4604 	aese	q1, v26.16b
   4605 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   4606 	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
   4607 
   4608 	aese	q2, v24.16b
   4609 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   4610 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   4611 
   4612 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   4613 
   4614 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   4615 	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
   4616 
   4617 	aese	q2, v25.16b
   4618 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   4619 	eor	r19, r19, r13                     @ AES block 4k+5 - round 14 low
   4620 
   4621 	aese	q1, v27.16b
   4622 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   4623 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   4624 
   4625 	aese	q3, v25.16b
   4626 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   4627 	eor	r21, r21, r13                     @ AES block 4k+6 - round 14 low
   4628 
   4629 	aese	q0, v27.16b
   4630 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   4631 	movi	q8, #0xc2
   4632 
   4633 	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
   4634 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   4635 	fmov	d5, r19                               @ AES block 4k+5 - mov low
   4636 
   4637 	aese	q2, v26.16b
   4638 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   4639 	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
   4640 #ifdef __ARMEB__
   4641 	rev	r6, r6
   4642 	rev	r7, r7
   4643 #endif
   4644 	aese	q0, v28.16b
   4645 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   4646 	shl	d8, d8, #56               @ mod_constant
   4647 
   4648 	aese	q3, v26.16b
   4649 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   4650 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   4651 
   4652 	aese	q2, v27.16b
   4653 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   4654 
   4655 	aese	q1, v28.16b
   4656 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   4657 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
   4658 
   4659 	aese	q3, v27.16b
   4660 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   4661 	add	r12, r12, #1                            @ CTR block 4k+3
   4662 
   4663 	aese	q0, v29.16b
   4664 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   4665 	eor	q4, v11.16b, q9                         @ MODULO - karatsuba tidy up
   4666 
   4667 	aese	q1, v29.16b
   4668 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   4669 	add	r0, r0, #64                       @ AES input_ptr update
   4670 
   4671 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   4672 	rev	r9, r12                                 @ CTR block 4k+8
   4673 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   4674 
   4675 	aese	q2, v28.16b
   4676 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   4677 	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
   4678 
   4679 	aese	q1, v30.16b
   4680 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   4681 	eor	v10.16b, v10.16b, q4                         @ MODULO - karatsuba tidy up
   4682 
   4683 	aese	q3, v28.16b
   4684 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   4685 	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
   4686 
   4687 	fmov	d4, r6                               @ AES block 4k+4 - mov low
   4688 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   4689 	eor	q7, q9, q7                   @ MODULO - fold into mid
   4690 
   4691 	aese	q0, v30.16b
   4692 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   4693 	eor	r20, r20, r14                     @ AES block 4k+5 - round 14 high
   4694 
   4695 	aese	q2, v29.16b
   4696 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   4697 	eor	r24, r24, r14                     @ AES block 4k+7 - round 14 high
   4698 
   4699 	aese	q3, v29.16b
   4700 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   4701 	add	r12, r12, #1                            @ CTR block 4k+8
   4702 
   4703 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   4704 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
   4705 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   4706 
   4707 	aese	q2, v30.16b
   4708 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   4709 	fmov	d7, r23                               @ AES block 4k+7 - mov low
   4710 
   4711 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   4712 	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
   4713 
   4714 	fmov	d6, r21                               @ AES block 4k+6 - mov low
   4715 	cmp	r0, r5                   @ .LOOP CONTROL
   4716 
   4717 	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
   4718 
   4719 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
   4720 	eor	q4, q4, q0                          @ AES block 4k+4 - result
   4721 	fmov	d0, r10                               @ CTR block 4k+8
   4722 
   4723 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   4724 	rev	r9, r12                                 @ CTR block 4k+9
   4725 	add	r12, r12, #1                            @ CTR block 4k+9
   4726 
   4727 	eor	q5, q5, q1                          @ AES block 4k+5 - result
   4728 	fmov	d1, r10                               @ CTR block 4k+9
   4729 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   4730 
   4731 	aese	q3, v30.16b
   4732 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   4733 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   4734 
   4735 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   4736 	rev	r9, r12                                 @ CTR block 4k+10
   4737 	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
   4738 
   4739 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   4740 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
   4741 	fmov	v7.d[1], r24                           @ AES block 4k+7 - mov high
   4742 
   4743 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   4744 	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
   4745 	add	r12, r12, #1                            @ CTR block 4k+10
   4746 
   4747 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   4748 	eor	q6, q6, q2                          @ AES block 4k+6 - result
   4749 	fmov	d2, r10                               @ CTR block 4k+10
   4750 
   4751 	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
   4752 	fmov	v2.d[1], r9                               @ CTR block 4k+10
   4753 	rev	r9, r12                                 @ CTR block 4k+11
   4754 
   4755 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   4756 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
   4757 
   4758 	eor	q7, q7, q3                          @ AES block 4k+7 - result
   4759 	st1	{ q7}, [r2], #16                     @ AES block 4k+7 - store result
   4760 	blt	.L256_enc_main_loop
   4761 
   4762 .L256_enc_prepretail:@ PREPRETAIL
        @ PREPRETAIL: no plaintext is loaded and no ciphertext is stored in this
        @ section. Two independent streams are interleaved for scheduling:
        @  * AES: the fourth counter block is assembled in q3 (from r10 and r9),
        @    the counter in r12 is bumped, and rounds 0-13 are run on q0-q3 with
        @    round keys v18-v31. Round 13 is aese without aesmc; the final round
        @    key is applied later through r13/r14 (the "round 14" XORs in the
        @    tail).
        @  * GHASH: q4-q7 are byte-reversed (rev64), the running hash v11 is
        @    folded into q4, and q4..q7 are multiplied by v15, v14, v13, v12
        @    respectively (mid terms use v17/v16), accumulating high in v9,
        @    mid in v10 and low in v11. The result is then reduced with the
        @    constant built by movi #0xc2 / shl #56 and left in v11.
        @ Execution falls through into .L256_enc_tail.
        @ NOTE(review): v16/v17 are loaded outside this view; presumably the
        @ precomputed Karatsuba key terms - confirm against the prologue.
   4763 	aese	q1, v18.16b
   4764 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   4765 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
   4766 
   4767 	aese	q2, v18.16b
   4768 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   4769 	fmov	d3, r10                               @ CTR block 4k+3
   4770 
   4771 	aese	q0, v18.16b
   4772 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   4773 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
   4774 
   4775 	fmov	v3.d[1], r9                               @ CTR block 4k+3
   4776 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   4777 
   4778 	aese	q2, v19.16b
   4779 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   4780 
   4781 	aese	q0, v19.16b
   4782 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   4783 
   4784 	eor	q4, q4, v11.16b                           @ PRE 1
   4785 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
   4786 
   4787 	aese	q2, v20.16b
   4788 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   4789 
   4790 	aese	q3, v18.16b
   4791 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   4792 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   4793 
   4794 	aese	q1, v19.16b
   4795 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   4796 
   4797 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   4798 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   4799 
   4800 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   4801 
   4802 	aese	q2, v21.16b
   4803 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   4804 
   4805 	aese	q1, v20.16b
   4806 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   4807 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   4808 
   4809 	aese	q0, v20.16b
   4810 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   4811 
   4812 	aese	q3, v19.16b
   4813 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   4814 
   4815 	aese	q1, v21.16b
   4816 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   4817 
   4818 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   4819 
   4820 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   4821 
   4822 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   4823 
   4824 	aese	q3, v20.16b
   4825 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   4826 
   4827 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   4828 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   4829 
   4830 	aese	q0, v21.16b
   4831 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   4832 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   4833 
   4834 	aese	q3, v21.16b
   4835 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   4836 
   4837 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   4838 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   4839 
   4840 	aese	q0, v22.16b
   4841 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   4842 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4843 
   4844 	aese	q3, v22.16b
   4845 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   4846 
   4847 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   4848 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   4849 	add	r12, r12, #1                            @ CTR block 4k+3
   4850 
   4851 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   4852 
   4853 	aese	q3, v23.16b
   4854 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   4855 
   4856 	aese	q2, v22.16b
   4857 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   4858 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   4859 
   4860 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   4861 
   4862 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   4863 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   4864 
   4865 	aese	q2, v23.16b
   4866 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   4867 
   4868 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   4869 	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
   4870 
   4871 	aese	q1, v22.16b
   4872 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   4873 
   4874 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   4875 
   4876 	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
   4877 
   4878 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   4879 
   4880 	aese	q1, v23.16b
   4881 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   4882 
   4883 	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
   4884 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   4885 
   4886 	aese	q0, v23.16b
   4887 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   4888 
   4889 	aese	q1, v24.16b
   4890 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   4891 
   4892 	aese	q2, v24.16b
   4893 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   4894 
   4895 	aese	q0, v24.16b
   4896 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   4897 	movi	q8, #0xc2                                     @ seed for mod_constant (shifted by 56 below)
   4898 
   4899 	aese	q3, v24.16b
   4900 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   4901 
   4902 	aese	q1, v25.16b
   4903 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   4904 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   4905 
   4906 	aese	q0, v25.16b
   4907 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   4908 
   4909 	aese	q3, v25.16b
   4910 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   4911 	shl	d8, d8, #56               @ mod_constant
   4912 
   4913 	aese	q1, v26.16b
   4914 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   4915 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
   4916 
   4917 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   4918 
   4919 	aese	q3, v26.16b
   4920 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   4921 
   4922 	aese	q1, v27.16b
   4923 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   4924 
   4925 	aese	q0, v26.16b
   4926 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   4927 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   4928 
   4929 	aese	q3, v27.16b
   4930 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   4931 
   4932 	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
   4933 
   4934 	pmull	v4.1q, q9, q8                                 @ MODULO - top 64b align with mid
   4935 	ext	q9, q9, q9, #8                                 @ MODULO - other top alignment
   4936 
   4937 	aese	q3, v28.16b
   4938 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   4939 
   4940 	aese	q2, v25.16b
   4941 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   4942 	eor	v10.16b, v10.16b, v11.16b                         @ MODULO - karatsuba tidy up
   4943 
   4944 	aese	q1, v28.16b
   4945 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   4946 
   4947 	aese	q0, v27.16b
   4948 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   4949 
   4950 	aese	q2, v26.16b
   4951 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   4952 
   4953 	aese	q1, v29.16b
   4954 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   4955 	eor	v10.16b, v10.16b, q4                         @ MODULO - fold into mid
   4956 
   4957 	aese	q0, v28.16b
   4958 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   4959 
   4960 	aese	q2, v27.16b
   4961 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   4962 
   4963 	aese	q1, v30.16b
   4964 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   4965 
   4966 	aese	q0, v29.16b
   4967 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   4968 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   4969 
   4970 	aese	q3, v29.16b
   4971 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   4972 
   4973 	aese	q2, v28.16b
   4974 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   4975 
   4976 	aese	q0, v30.16b
   4977 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   4978 
   4979 	pmull	v4.1q, v10.1d, q8                             @ MODULO - mid 64b align with low
   4980 
   4981 	aese	q2, v29.16b
   4982 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   4983 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   4984 
   4985 	aese	q3, v30.16b
   4986 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   4987 
   4988 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   4989 	eor	v11.16b, v11.16b, q4                         @ MODULO - fold into low
   4990 
   4991 	aese	q2, v30.16b
   4992 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   4993 
   4994 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   4995 
   4996 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   4997 
   4998 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   4999 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   5000 .L256_enc_tail:@ TAIL
        @ TAIL dispatch. On entry q0-q3 hold keystream blocks that still lack
        @ the last round key, and v11 holds the reduced running hash.
        @ This section:
        @  * saves the running hash (rotated by 8 bytes) in q8 so it can be fed
        @    into whichever block is hashed first;
        @  * computes r5 = bytes still to process and produces the first
        @    output block in q5 = (plaintext ^ r14:r13) ^ q0;
        @  * picks the entry point by comparing r5 with 48 / 32 / 16.
        @ For every keystream block that will not be used, r12 is decremented
        @ so the counter stored at the end matches the blocks consumed, and
        @ the keystream registers are shuffled because the sections below
        @ always consume q1, then q2, then q3. With 3 or fewer blocks left
        @ the accumulators v9/v10/v11 are zeroed here, since the "more than 3"
        @ section that would otherwise initialise them is skipped.
   5001 
   5002 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   5003 	sub	r5, r4, r0   @ r5 (the main loop bound until now) = end_input_ptr - input_ptr = bytes left to process
   5004 	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
   5005 #ifdef __ARMEB__
   5006 	rev	r6, r6
   5007 	rev	r7, r7
   5008 #endif
   5009 	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
   5010 	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
   5011 
   5012 	cmp	r5, #48
   5013 	fmov	d4, r6                               @ AES block 4k+4 - mov low
   5014 
   5015 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
   5016 
   5017 	eor	q5, q4, q0                          @ AES block 4k+4 - result
   5018 	bgt	.L256_enc_blocks_more_than_3
   5019 
   5020 	cmp	r5, #32                                  @ flags consumed by the bgt below; nothing in between sets flags
   5021 	mov	q3, q2                                   @ 3-block case: final block will use old q2
   5022 	movi	v11.8b, #0                               @ no "more than 3" section: start low accumulator at zero
   5023 
   5024 	movi	q9, #0                                   @ ... and the high accumulator
   5025 	sub	r12, r12, #1                             @ one keystream block unused: roll the counter back
   5026 
   5027 	mov	q2, q1                                   @ 3-block case: final-1 block will use old q1
   5028 	movi	v10.8b, #0                               @ ... and the mid accumulator
   5029 	bgt	.L256_enc_blocks_more_than_2
   5030 
   5031 	mov	q3, q1                                   @ 2-block case: final block will use old q1
   5032 	sub	r12, r12, #1                             @ second unused keystream block
   5033 	cmp	r5, #16
   5034 
   5035 	bgt	.L256_enc_blocks_more_than_1
   5036 
   5037 	sub	r12, r12, #1                             @ third unused keystream block
   5038 	b	.L256_enc_blocks_less_than_1
   5039 .L256_enc_blocks_more_than_3:@ blocks left >  3
        @ Entered with q5 = finished output block "final-3". Stores it, hashes
        @ it (byte-reversed, with the saved partial tag q8 XORed in) against
        @ v15, and writes - not accumulates - the low/high/mid products into
        @ v11/v9/v10. q8 is then cleared so the tag is fed in only once.
        @ Meanwhile the next plaintext block is loaded and combined with
        @ r14:r13 and keystream q1 to form the next output block in q5.
        @ v22 (used as an AES round key earlier in this function) is scratch
        @ from here on. Falls through to .L256_enc_blocks_more_than_2.
   5040 	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
   5041 
   5042 	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
   5043 #ifdef __ARMEB__
   5044 	rev	r6, r6
   5045 	rev	r7, r7
   5046 #endif
   5047 	rev64	q4, q5                                   @ GHASH final-3 block
   5048 
   5049 	eor	r6, r6, r13                    @ AES final-2 block - round 14 low
   5050 	eor	q4, q4, q8                          @ feed in partial tag
   5051 
   5052 	eor	r7, r7, r14                    @ AES final-2 block - round 14 high
   5053 
   5054 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   5055 	fmov	d5, r6                                @ AES final-2 block - mov low
   5056 
   5057 	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
   5058 
   5059 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   5060 	movi	q8, #0                                       @ suppress further partial tag feed in
   5061 
   5062 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   5063 
   5064 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   5065 
   5066 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   5067 
   5068 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   5069 	eor	q5, q5, q1                           @ AES final-2 block - result
   5070 .L256_enc_blocks_more_than_2:@ blocks left >  2
        @ Entered with q5 = finished output block "final-2" (either from the
        @ section above or straight from the tail dispatch). Stores it, hashes
        @ it against v14 (mid term against v17) and XOR-accumulates the
        @ products into v9 (high), v11 (low) and v10 (mid). q8 carries the
        @ partial tag only if no earlier block consumed it; it is zeroed after
        @ use. The next output block is built in q5 from the next plaintext,
        @ r14:r13 and keystream q2. v20-v22 (AES round keys earlier in this
        @ function) are scratch here. Falls through to
        @ .L256_enc_blocks_more_than_1.
   5071 
   5072 	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
   5073 
   5074 	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
   5075 #ifdef __ARMEB__
   5076 	rev	r6, r6
   5077 	rev	r7, r7
   5078 #endif
   5079 	rev64	q4, q5                                   @ GHASH final-2 block
   5080 
   5081 	eor	r6, r6, r13                    @ AES final-1 block - round 14 low
   5082 	eor	q4, q4, q8                          @ feed in partial tag
   5083 
   5084 	fmov	d5, r6                                @ AES final-1 block - mov low
   5085 	eor	r7, r7, r14                    @ AES final-1 block - round 14 high
   5086 
   5087 	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
   5088 
   5089 	movi	q8, #0                                       @ suppress further partial tag feed in
   5090 
   5091 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   5092 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   5093 
   5094 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   5095 
   5096 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   5097 
   5098 	eor	q5, q5, q2                           @ AES final-1 block - result
   5099 
   5100 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   5101 
   5102 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   5103 
   5104 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   5105 
   5106 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   5107 .L256_enc_blocks_more_than_1:@ blocks left >  1
        @ Entered with q5 = finished output block "final-1". Stores it, hashes
        @ it against v13 and XOR-accumulates into v9/v11/v10. The mid term
        @ duplicates the folded 64-bit value into the upper lane (ins) so that
        @ pmull2 can multiply it by the upper half of v16. The last output
        @ block is then formed in q5 from the final 16 bytes of input,
        @ r14:r13 and keystream q3; it is masked and stored by the section
        @ below. Falls through to .L256_enc_blocks_less_than_1.
        @ NOTE(review): the final ldp always reads a full 16 bytes even when
        @ the last block is partial - confirm callers guarantee the source
        @ buffer is readable that far.
   5108 
   5109 	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result
   5110 
   5111 	rev64	q4, q5                                   @ GHASH final-1 block
   5112 
   5113 	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high
   5114 #ifdef __ARMEB__
   5115 	rev	r6, r6
   5116 	rev	r7, r7
   5117 #endif
   5118 	eor	q4, q4, q8                          @ feed in partial tag
   5119 
   5120 	movi	q8, #0                                       @ suppress further partial tag feed in
   5121 
   5122 	eor	r6, r6, r13                    @ AES final block - round 14 low
   5123 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   5124 
   5125 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   5126 	eor	r7, r7, r14                    @ AES final block - round 14 high
   5127 
   5128 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   5129 
   5130 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   5131 
   5132 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   5133 	fmov	d5, r6                                @ AES final block - mov low
   5134 
   5135 	fmov	v5.d[1], r7                            @ AES final block - mov high
   5136 
   5137 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   5138 
   5139 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   5140 
   5141 	eor	q5, q5, q3                           @ AES final block - result
   5142 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   5143 
   5144 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   5145 .L256_enc_blocks_less_than_1:@ blocks left <= 1
        @ Final (possibly partial) block, tag update and function epilogue.
        @ Entered with q5 = last output block computed over a full 16 bytes.
        @ Steps:
        @  1. Build a byte mask in q0 from the bit length in r1. r13/r14 no
        @     longer hold the last round key after this point; they are
        @     reused as all-ones sources for the mask.
        @  2. Clear the bytes of q5 beyond the message end, hash the masked
        @     block against v12 (mid term against v16) and accumulate into
        @     v9/v10/v11.
        @  3. Merge the bytes already present at [r2] back into the masked-off
        @     positions (bif) so memory past the message end is rewritten
        @     with its previous contents by the full 16-byte store.
        @  4. Reduce v9:v10:v11 with the 0xc2<<56 constant into v11.
        @  5. Store the updated 32-bit counter at [r16, #12], the output block
        @     at [r2] and the hash state at [r3]; return r15 in r0.
        @  6. Restore r19-r24 and d8-d15 and release the 112-byte frame.
        @ NOTE(review): xzr and csel are AArch64 constructs while the register
        @ names here are AArch32-style (r13/r14/r15 are sp/lr/pc in AArch32);
        @ confirm this translation unit is really assembled for this target.
   5146 
   5147 	and	r1, r1, #127                   @ bit_length %= 128
   5148 
   5149 	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff
   5150 	sub	r1, r1, #128                   @ bit_length -= 128
   5151 
   5152 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   5153 	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
   5154 
   5155 	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff
   5156 	and	r1, r1, #127                   @ bit_length %= 128
   5157 
   5158 	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
   5159 	cmp	r1, #64                         @ r1 = number of bits to discard; does the cut fall in the upper or lower half?
   5160 
   5161 	csel	r6, r13, r14, lt                @ low mask: all ones if r1 < 64, otherwise the shifted mask
   5162 	csel	r7, r14, xzr, lt                @ high mask: the shifted mask if r1 < 64, otherwise zero
   5163 
   5164 	fmov	d0, r6                                @ ctr0b is mask for last block
   5165 
   5166 	fmov	v0.d[1], r7
   5167 
   5168 	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
   5169 
   5170 	rev64	q4, q5                                   @ GHASH final block
   5171 
   5172 	eor	q4, q4, q8                          @ feed in partial tag
   5173 
   5174 	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
   5175 
   5176 	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
   5177 	mov	d8, v4.d[1]                                 @ GHASH final block - mid
   5178 #ifndef __ARMEB__
   5179 	rev	r9, r12
   5180 #else
   5181 	mov	r9, r12
   5182 #endif
   5183 
   5184 	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
   5185 
   5186 	eor	q9, q9, v20.16b                           @ GHASH final block - high
   5187 	eor	q8, q8, q4                         @ GHASH final block - mid
   5188 
   5189 	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
   5190 
   5191 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
   5192 
   5193 	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
   5194 	movi	q8, #0xc2                                    @ seed for mod_constant (shifted by 56 below)
   5195 
   5196 	eor	q4, v11.16b, q9                        @ MODULO - karatsuba tidy up
   5197 
   5198 	shl	d8, d8, #56              @ mod_constant
   5199 
   5200 	eor	v10.16b, v10.16b, q4                        @ MODULO - karatsuba tidy up
   5201 
   5202 	pmull	v7.1q, q9, q8           @ MODULO - top 64b align with mid
   5203 
   5204 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   5205 
   5206 	eor	v10.16b, v10.16b, q7                     @ MODULO - fold into mid
   5207 
   5208 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   5209 
   5210 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   5211 
   5212 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   5213 
   5214 	str	r9, [r16, #12]                         @ store the updated counter
   5215 
   5216 	st1	{ q5}, [r2]                         @ store all 16B
   5217 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   5218 
   5219 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   5220 	ext	v11.16b, v11.16b, v11.16b, #8                   @ swap 64-bit halves; the dec kernel applies the same ext/rev64 pair when loading from [r3]
   5221 	rev64	v11.16b, v11.16b                                @ byte-reverse each half
   5222 	mov	r0, r15                                         @ return value; presumably the byte_len copy made at entry (prologue outside this view)
   5223 	st1	{ v11.16b }, [r3]                               @ write back the updated hash state
   5224 
   5225 	ldp	r21, r22, [sp, #16]
   5226 	ldp	r23, r24, [sp, #32]
   5227 	ldp	d8, d9, [sp, #48]
   5228 	ldp	d10, d11, [sp, #64]
   5229 	ldp	d12, d13, [sp, #80]
   5230 	ldp	d14, d15, [sp, #96]
   5231 	ldp	r19, r20, [sp], #112                            @ restore last pair and pop the 112-byte frame
   5232 	RET
   5233 
   5234 .L256_enc_ret:
        @ Early-exit path: sets the return value r0 to 0 and returns. Nothing
        @ is restored here, so it must only be reached before the frame is
        @ pushed. Presumably the target of a zero-length check at function
        @ entry (the dec kernel below does "cbz r1, .L256_dec_ret" as its
        @ first instruction); the enc entry is outside this view.
   5235 	mov	r0, #0x0
   5236 	RET
   5237 .size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
   5238 .globl	aes_gcm_dec_256_kernel
   5239 .type	aes_gcm_dec_256_kernel,%function
   5240 .align	4
   5241 aes_gcm_dec_256_kernel:
   5242 	cbz	r1, .L256_dec_ret
   5243 	stp	r19, r20, [sp, #-112]!
   5244 	mov	r16, r4
   5245 	mov	r8, r5
   5246 	stp	r21, r22, [sp, #16]
   5247 	stp	r23, r24, [sp, #32]
   5248 	stp	d8, d9, [sp, #48]
   5249 	stp	d10, d11, [sp, #64]
   5250 	stp	d12, d13, [sp, #80]
   5251 	stp	d14, d15, [sp, #96]
   5252 
   5253 	lsr	r5, r1, #3              @ byte_len
   5254 	mov	r15, r5
   5255 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   5256 #ifdef __ARMEB__
   5257 	rev	r10, r10
   5258 	rev	r11, r11
   5259 #endif
   5260 	ldp	r13, r14, [r8, #224]                     @ load rk14
   5261 #ifdef __ARMEB__
   5262 	ror	r14, r14, #32
   5263 	ror	r13, r13, #32
   5264 #endif
   5265 	ld1	{v18.4s}, [r8], #16                               @ load rk0
   5266 	sub	r5, r5, #1      @ byte_len - 1
   5267 
   5268 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   5269 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   5270 
   5271 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   5272 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   5273 
   5274 	lsr	r12, r11, #32
   5275 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   5276 	orr	r11, r11, r11
   5277 
   5278 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   5279 	add	r5, r5, r0
   5280 	rev	r12, r12                                @ rev_ctr32
   5281 
   5282 	add	r12, r12, #1                            @ increment rev_ctr32
   5283 	fmov	d3, r10                               @ CTR block 3
   5284 
   5285 	rev	r9, r12                                 @ CTR block 1
   5286 	add	r12, r12, #1                            @ CTR block 1
   5287 	fmov	d1, r10                               @ CTR block 1
   5288 
   5289 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   5290 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   5291 
   5292 	fmov	v1.d[1], r9                               @ CTR block 1
   5293 	rev	r9, r12                                 @ CTR block 2
   5294 	add	r12, r12, #1                            @ CTR block 2
   5295 
   5296 	fmov	d2, r10                               @ CTR block 2
   5297 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   5298 
   5299 	fmov	v2.d[1], r9                               @ CTR block 2
   5300 	rev	r9, r12                                 @ CTR block 3
   5301 
   5302 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   5303 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   5304 
   5305 	fmov	v3.d[1], r9                               @ CTR block 3
   5306 	add	r12, r12, #1                            @ CTR block 3
   5307 
   5308 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   5309 
   5310 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   5311 
   5312 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   5313 
   5314 	aese	q0, v18.16b
   5315 	aesmc	q0, q0          @ AES block 0 - round 0
   5316 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   5317 #ifndef __ARMEB__
   5318 	ext	v14.16b, v14.16b, v14.16b, #8
   5319 #endif
   5320 
   5321 	aese	q3, v18.16b
   5322 	aesmc	q3, q3          @ AES block 3 - round 0
   5323 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   5324 #ifndef __ARMEB__
   5325 	ext	v15.16b, v15.16b, v15.16b, #8
   5326 #endif
   5327 
   5328 	aese	q1, v18.16b
   5329 	aesmc	q1, q1          @ AES block 1 - round 0
   5330 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   5331 #ifndef __ARMEB__
   5332 	ext	v13.16b, v13.16b, v13.16b, #8
   5333 #endif
   5334 
   5335 	aese	q2, v18.16b
   5336 	aesmc	q2, q2          @ AES block 2 - round 0
   5337 	ld1	{v27.4s}, [r8], #16                                 @ load rk9
   5338 
   5339 	aese	q0, v19.16b
   5340 	aesmc	q0, q0          @ AES block 0 - round 1
   5341 
   5342 	aese	q1, v19.16b
   5343 	aesmc	q1, q1          @ AES block 1 - round 1
   5344 	ld1	{ v11.16b}, [r3]
   5345 	ext	v11.16b, v11.16b, v11.16b, #8
   5346 	rev64	v11.16b, v11.16b
   5347 
   5348 	aese	q2, v19.16b
   5349 	aesmc	q2, q2          @ AES block 2 - round 1
   5350 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   5351 
   5352 	aese	q3, v19.16b
   5353 	aesmc	q3, q3          @ AES block 3 - round 1
   5354 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   5355 
   5356 	aese	q0, v20.16b
   5357 	aesmc	q0, q0          @ AES block 0 - round 2
   5358 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   5359 #ifndef __ARMEB__
   5360 	ext	v12.16b, v12.16b, v12.16b, #8
   5361 #endif
   5362 	aese	q2, v20.16b
   5363 	aesmc	q2, q2          @ AES block 2 - round 2
   5364 	ld1	{v30.4s}, [r8], #16                              @ load rk12
   5365 
   5366 	aese	q3, v20.16b
   5367 	aesmc	q3, q3          @ AES block 3 - round 2
   5368 
   5369 	aese	q0, v21.16b
   5370 	aesmc	q0, q0          @ AES block 0 - round 3
   5371 
   5372 	aese	q1, v20.16b
   5373 	aesmc	q1, q1          @ AES block 1 - round 2
   5374 
   5375 	aese	q3, v21.16b
   5376 	aesmc	q3, q3          @ AES block 3 - round 3
   5377 
   5378 	aese	q0, v22.16b
   5379 	aesmc	q0, q0          @ AES block 0 - round 4
   5380 	cmp	r0, r5                   @ check if we have <= 4 blocks
   5381 
   5382 	aese	q2, v21.16b
   5383 	aesmc	q2, q2          @ AES block 2 - round 3
   5384 
   5385 	aese	q1, v21.16b
   5386 	aesmc	q1, q1          @ AES block 1 - round 3
   5387 
   5388 	aese	q3, v22.16b
   5389 	aesmc	q3, q3          @ AES block 3 - round 4
   5390 
   5391 	aese	q2, v22.16b
   5392 	aesmc	q2, q2          @ AES block 2 - round 4
   5393 
   5394 	aese	q1, v22.16b
   5395 	aesmc	q1, q1          @ AES block 1 - round 4
   5396 
   5397 	aese	q3, v23.16b
   5398 	aesmc	q3, q3          @ AES block 3 - round 5
   5399 
   5400 	aese	q0, v23.16b
   5401 	aesmc	q0, q0          @ AES block 0 - round 5
   5402 
   5403 	aese	q1, v23.16b
   5404 	aesmc	q1, q1          @ AES block 1 - round 5
   5405 
   5406 	aese	q2, v23.16b
   5407 	aesmc	q2, q2          @ AES block 2 - round 5
   5408 
   5409 	aese	q0, v24.16b
   5410 	aesmc	q0, q0          @ AES block 0 - round 6
   5411 
   5412 	aese	q3, v24.16b
   5413 	aesmc	q3, q3          @ AES block 3 - round 6
   5414 
   5415 	aese	q1, v24.16b
   5416 	aesmc	q1, q1          @ AES block 1 - round 6
   5417 
   5418 	aese	q2, v24.16b
   5419 	aesmc	q2, q2          @ AES block 2 - round 6
   5420 
   5421 	aese	q0, v25.16b
   5422 	aesmc	q0, q0          @ AES block 0 - round 7
   5423 
   5424 	aese	q1, v25.16b
   5425 	aesmc	q1, q1          @ AES block 1 - round 7
   5426 
   5427 	aese	q3, v25.16b
   5428 	aesmc	q3, q3          @ AES block 3 - round 7
   5429 
   5430 	aese	q0, v26.16b
   5431 	aesmc	q0, q0          @ AES block 0 - round 8
   5432 
   5433 	aese	q2, v25.16b
   5434 	aesmc	q2, q2          @ AES block 2 - round 7
   5435 
   5436 	aese	q3, v26.16b
   5437 	aesmc	q3, q3          @ AES block 3 - round 8
   5438 
   5439 	aese	q1, v26.16b
   5440 	aesmc	q1, q1          @ AES block 1 - round 8
   5441 
   5442 	aese	q0, v27.16b
   5443 	aesmc	q0, q0          @ AES block 0 - round 9
   5444 
   5445 	aese	q2, v26.16b
   5446 	aesmc	q2, q2          @ AES block 2 - round 8
   5447 	ld1	{v31.4s}, [r8], #16                             @ load rk13
   5448 
   5449 	aese	q1, v27.16b
   5450 	aesmc	q1, q1          @ AES block 1 - round 9
   5451 
   5452 	aese	q0, v28.16b
   5453 	aesmc	q0, q0          @ AES block 0 - round 10
   5454 
   5455 	aese	q3, v27.16b
   5456 	aesmc	q3, q3          @ AES block 3 - round 9
   5457 
   5458 	aese	q1, v28.16b
   5459 	aesmc	q1, q1          @ AES block 1 - round 10
   5460 
   5461 	aese	q2, v27.16b
   5462 	aesmc	q2, q2          @ AES block 2 - round 9
   5463 
   5464 	aese	q3, v28.16b
   5465 	aesmc	q3, q3          @ AES block 3 - round 10
   5466 
   5467 	aese	q0, v29.16b
   5468 	aesmc	q0, q0          @ AES block 0 - round 11
   5469 
   5470 	aese	q2, v28.16b
   5471 	aesmc	q2, q2          @ AES block 2 - round 10
   5472 
   5473 	aese	q3, v29.16b
   5474 	aesmc	q3, q3          @ AES block 3 - round 11
   5475 
   5476 	aese	q1, v29.16b
   5477 	aesmc	q1, q1          @ AES block 1 - round 11
   5478 
   5479 	aese	q2, v29.16b
   5480 	aesmc	q2, q2          @ AES block 2 - round 11
   5481 
   5482 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   5483 
   5484 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   5485 
   5486 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   5487 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   5488 
   5489 	aese	q1, v30.16b
   5490 	aesmc	q1, q1          @ AES block 1 - round 12
   5491 
   5492 	aese	q0, v30.16b
   5493 	aesmc	q0, q0          @ AES block 0 - round 12
   5494 
   5495 	aese	q2, v30.16b
   5496 	aesmc	q2, q2          @ AES block 2 - round 12
   5497 
   5498 	aese	q3, v30.16b
   5499 	aesmc	q3, q3          @ AES block 3 - round 12
   5500 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   5501 
   5502 	aese	q1, v31.16b                                     @ AES block 1 - round 13
   5503 
   5504 	aese	q2, v31.16b                                     @ AES block 2 - round 13
   5505 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   5506 
   5507 	aese	q3, v31.16b                                     @ AES block 3 - round 13
   5508 
   5509 	aese	q0, v31.16b                                     @ AES block 0 - round 13
   5510 	bge	.L256_dec_tail                                    @ handle tail
   5511 
   5512 	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
   5513 
   5514 	rev	r9, r12                                 @ CTR block 4
   5515 
   5516 	eor	q0, q4, q0                            @ AES block 0 - result
   5517 
   5518 	eor	q1, q5, q1                            @ AES block 1 - result
   5519 	rev64	q5, q5                                    @ GHASH block 1
   5520 	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
   5521 
   5522 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
   5523 
   5524 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
   5525 	rev64	q4, q4                                    @ GHASH block 0
   5526 	add	r12, r12, #1                            @ CTR block 4
   5527 
   5528 	fmov	d0, r10                               @ CTR block 4
   5529 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   5530 
   5531 	fmov	v0.d[1], r9                               @ CTR block 4
   5532 	rev	r9, r12                                 @ CTR block 5
   5533 	add	r12, r12, #1                            @ CTR block 5
   5534 
   5535 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
   5536 
   5537 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   5538 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
   5539 	eor	r7, r7, r14                   @ AES block 0 - round 14 high
   5540 #ifdef __ARMEB__
   5541 	rev	r7, r7
   5542 #endif
   5543 	eor	r6, r6, r13                   @ AES block 0 - round 14 low
   5544 #ifdef __ARMEB__
   5545 	rev	r6, r6
   5546 #endif
   5547 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
   5548 	fmov	d1, r10                               @ CTR block 5
   5549 
   5550 	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
   5551 
   5552 	fmov	v1.d[1], r9                               @ CTR block 5
   5553 	rev	r9, r12                                 @ CTR block 6
   5554 	add	r12, r12, #1                            @ CTR block 6
   5555 
   5556 	eor	r19, r19, r13                   @ AES block 1 - round 14 low
   5557 #ifdef __ARMEB__
   5558 	rev	r19, r19
   5559 #endif
   5560 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   5561 
   5562 	eor	r20, r20, r14                   @ AES block 1 - round 14 high
   5563 #ifdef __ARMEB__
   5564 	rev	r20, r20
   5565 #endif
   5566 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
   5567 
   5568 	eor	q2, q6, q2                            @ AES block 2 - result
   5569 	cmp	r0, r5                   @ check if we have <= 8 blocks
   5570 	bge	.L256_dec_prepretail                              @ do prepretail
   5571 
   5572 .L256_dec_main_loop:@ main loop start
        	@ Steady-state loop of the .L256_dec_* (decrypt) path: one pass handles a
        	@ group of four 16-byte blocks. Three streams of work are interleaved, and
        	@ each line's trailing comment names the stream and block it belongs to:
        	@   AES   - rounds 0-13 with round keys v18-v31 on the counter blocks
        	@           4k+4..4k+7 held in q0-q3. What the line comments call
        	@           "round 14" is applied on the integer side: after the keystream
        	@           is XORed with the ciphertext, the two 64-bit halves are moved
        	@           to core registers and XORed with r13 (low) / r14 (high).
        	@   GHASH - the four ciphertext blocks in q4-q7 are multiplied (pmull /
        	@           pmull2) by v15, v14, v13, v12 respectively, with v17 / v16
        	@           supplying the "mid" (Karatsuba) operands. Partial products
        	@           accumulate in v9 (high), v10 (mid), v11 (low) and are reduced
        	@           with the constant built by movi #0xc2 + shl #56. v11 carries
        	@           the running hash into the next pass.
        	@   CTR   - r12 is incremented, byte-reversed into r9, merged with r11
        	@           (orr ... lsl #32) and paired with r10 via fmov to form the
        	@           next counter block.
        	@ State on entry (both from the code above the label and from the bottom of
        	@ the previous pass): q4,q5 are already rev64'ed; q6,q7 still hold raw
        	@ ciphertext; q2 has already been XORed with q6; q3 still needs the XOR
        	@ with q7; r9 holds the upper half of counter block 4k+6.
        	@ Memory: loads 4 x 16 bytes through r0 (post-increment), stores 4 x 16
        	@ bytes through r2 (post-increment) - blocks 4k+2, 4k+3, 4k+4, 4k+5.
        	@ NOTE(review): r13/r14 and v12-v17 are set up before this label; the roles
        	@ given here are taken from the existing line comments - confirm against
        	@ the function prologue.
        	@ NOTE(review): operands mix r*/q* register names with AArch64-style forms
        	@ (stp, fmov, v2.d[1], pmull2 ... .1q); presumably mechanically derived
        	@ from an AArch64 source - confirm this assembles for the intended target.
   5573 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   5574 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   5575 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   5576 
   5577 	aese	q0, v18.16b
   5578 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   5579 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   5580 
   5581 	aese	q1, v18.16b
   5582 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   5583 	fmov	d2, r10                               @ CTR block 4k+6
   5584 
   5585 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   5586 	eor	q4, q4, v11.16b                           @ PRE 1
   5587 	rev	r9, r12                                 @ CTR block 4k+7
   5588 
   5589 	aese	q0, v19.16b
   5590 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   5591 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   5592 
   5593 	aese	q1, v19.16b
   5594 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   5595 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   5596 
   5597 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   5598 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   5599 	fmov	d3, r10                               @ CTR block 4k+7
   5600 
   5601 	aese	q0, v20.16b
   5602 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   5603 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   5604 
   5605 	aese	q2, v18.16b
   5606 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   5607 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   5608 
   5609 	aese	q1, v20.16b
   5610 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   5611 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   5612 
   5613 	aese	q0, v21.16b
   5614 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   5615 	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
   5616 #ifdef __ARMEB__
   5617 	rev	r22, r22
   5618 #endif
   5619 	aese	q2, v19.16b
   5620 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   5621 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   5622 
   5623 	aese	q1, v21.16b
   5624 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   5625 	rev64	q6, q6                                    @ GHASH block 4k+2
   5626 
   5627 	aese	q3, v18.16b
   5628 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   5629 	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
   5630 #ifdef __ARMEB__
   5631 	rev	r21, r21
   5632 #endif
   5633 	aese	q2, v20.16b
   5634 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   5635 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   5636 
   5637 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   5638 
   5639 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   5640 
   5641 	aese	q2, v21.16b
   5642 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   5643 	rev64	q7, q7                                    @ GHASH block 4k+3
   5644 
   5645 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   5646 	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
   5647 #ifdef __ARMEB__
   5648 	rev	r23, r23
   5649 #endif
   5650 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   5651 	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
   5652 #ifdef __ARMEB__
   5653 	rev	r24, r24
   5654 #endif
   5655 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   5656 
   5657 	aese	q2, v22.16b
   5658 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   5659 
   5660 	aese	q3, v19.16b
   5661 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   5662 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   5663 
   5664 	aese	q0, v22.16b
   5665 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   5666 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   5667 
   5668 	aese	q2, v23.16b
   5669 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   5670 	add	r12, r12, #1                            @ CTR block 4k+7
   5671 
   5672 	aese	q3, v20.16b
   5673 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   5674 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   5675 
   5676 	aese	q1, v22.16b
   5677 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   5678 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   5679 
   5680 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   5681 
   5682 	aese	q3, v21.16b
   5683 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   5684 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   5685 
   5686 	aese	q1, v23.16b
   5687 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   5688 
   5689 	aese	q0, v23.16b
   5690 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   5691 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   5692 
   5693 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   5694 	rev	r9, r12                                 @ CTR block 4k+8
   5695 
   5696 	aese	q1, v24.16b
   5697 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   5698 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   5699 
   5700 	aese	q0, v24.16b
   5701 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   5702 	add	r12, r12, #1                            @ CTR block 4k+8
   5703 
   5704 	aese	q3, v22.16b
   5705 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   5706 
   5707 	aese	q1, v25.16b
   5708 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   5709 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   5710 
   5711 	aese	q0, v25.16b
   5712 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   5713 
   5714 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   5715 	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
   5716 
   5717 	aese	q3, v23.16b
   5718 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   5719 
   5720 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   5721 
   5722 	aese	q0, v26.16b
   5723 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   5724 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   5725 
   5726 	aese	q3, v24.16b
   5727 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   5728 
   5729 	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   5730 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   5731 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   5732 
   5733 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   5734 
   5735 	aese	q0, v27.16b
   5736 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   5737 	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
   5738 
   5739 	aese	q1, v26.16b
   5740 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   5741 
   5742 	aese	q2, v24.16b
   5743 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   5744 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   5745 
   5746 	aese	q0, v28.16b
   5747 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   5748 
   5749 	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
   5750 	movi	q8, #0xc2
   5751 
   5752 	aese	q2, v25.16b
   5753 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   5754 	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
   5755 
   5756 	aese	q0, v29.16b
   5757 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   5758 
   5759 	aese	q3, v25.16b
   5760 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   5761 	shl	d8, d8, #56               @ mod_constant
   5762 
   5763 	aese	q2, v26.16b
   5764 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   5765 	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
   5766 
   5767 	aese	q0, v30.16b
   5768 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   5769 
   5770 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   5771 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   5772 
   5773 	aese	q1, v27.16b
   5774 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   5775 	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext
   5776 
   5777 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   5778 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   5779 
   5780 	aese	q1, v28.16b
   5781 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   5782 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   5783 
   5784 	aese	q2, v27.16b
   5785 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   5786 	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
   5787 
   5788 	aese	q3, v26.16b
   5789 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   5790 	eor	q0, q4, q0                            @ AES block 4k+4 - result
   5791 
   5792 	aese	q1, v29.16b
   5793 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   5794 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   5795 
   5796 	aese	q2, v28.16b
   5797 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   5798 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   5799 
   5800 	aese	q3, v27.16b
   5801 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   5802 	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext
   5803 
   5804 	aese	q1, v30.16b
   5805 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   5806 	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
   5807 
   5808 	aese	q2, v29.16b
   5809 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   5810 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   5811 
   5812 	aese	q3, v28.16b
   5813 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   5814 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   5815 
   5816 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   5817 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   5818 
   5819 	aese	q2, v30.16b
   5820 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   5821 	fmov	d0, r10                               @ CTR block 4k+8
   5822 
   5823 	aese	q3, v29.16b
   5824 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   5825 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   5826 
   5827 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   5828 	eor	q1, q5, q1                            @ AES block 4k+5 - result
   5829 	rev	r9, r12                                 @ CTR block 4k+9
   5830 
   5831 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   5832 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   5833 	cmp	r0, r5                   @ LOOP CONTROL - input pointer vs r5; flags are consumed by the blt at the bottom of the loop
   5834 
   5835 	add	r12, r12, #1                            @ CTR block 4k+9
   5836 
   5837 	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
   5838 #ifdef __ARMEB__
   5839 	rev	r6, r6
   5840 #endif
   5841 	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
   5842 #ifdef __ARMEB__
   5843 	rev	r7, r7
   5844 #endif
   5845 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
   5846 	eor	q2, q6, q2                            @ AES block 4k+6 - result
   5847 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   5848 
   5849 	aese	q3, v30.16b
   5850 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   5851 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
   5852 
   5853 	fmov	d1, r10                               @ CTR block 4k+9
   5854 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   5855 
   5856 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   5857 	rev	r9, r12                                 @ CTR block 4k+10
   5858 	add	r12, r12, #1                            @ CTR block 4k+10
   5859 
   5860 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   5861 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   5862 
   5863 	rev64	q5, q5                                    @ GHASH block 4k+5
   5864 	eor	r20, r20, r14                   @ AES block 4k+5 - round 14 high
   5865 #ifdef __ARMEB__
   5866 	rev	r20, r20
   5867 #endif
   5868 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
   5869 
   5870 	eor	r19, r19, r13                   @ AES block 4k+5 - round 14 low
   5871 #ifdef __ARMEB__
   5872 	rev	r19, r19
   5873 #endif
   5874 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
   5875 
   5876 	rev64	q4, q4                                    @ GHASH block 4k+4
   5877 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
        	@ Repeat while r0 < r5, using the flags from the "LOOP CONTROL" cmp above
        	@ (none of the instructions in between is a flag-setting form).
        	@ NOTE(review): blt is a signed comparison applied to two pointers -
        	@ confirm that is intended.
   5878 	blt	.L256_dec_main_loop
   5879 
   5880 
   5881 .L256_dec_prepretail:@ PREPRETAIL
        	@ Last full four-block pass before the tail. Reached either by the bge
        	@ taken before the main loop or by falling out of the main loop's blt.
        	@ It performs the same interleaved AES / GHASH / CTR work as one
        	@ main-loop pass, with these differences visible in the code below:
        	@   - no ciphertext is loaded from r0;
        	@   - only blocks 4k+2 and 4k+3 are finished and stored through r2
        	@     (XOR with r13/r14, post-increment stp);
        	@   - r12 is incremented once (CTR block 4k+7) rather than four times.
        	@ On exit q0-q3 hold counter blocks 4k+4..4k+7 with AES rounds 0-13
        	@ applied (round keys v18-v31), v11 holds the GHASH value folded over the
        	@ blocks that were in q4-q7, and execution falls through to
        	@ .L256_dec_tail.
        	@ NOTE(review): the GHASH operand roles (v12-v15 with v16/v17 as mid
        	@ terms) and the r13/r14 final-round-key halves are established before
        	@ this label; the description follows the existing line comments -
        	@ confirm against the function prologue.
   5882 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   5883 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   5884 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   5885 
   5886 	aese	q0, v18.16b
   5887 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   5888 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   5889 
   5890 	aese	q1, v18.16b
   5891 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   5892 	fmov	d2, r10                               @ CTR block 4k+6
   5893 
   5894 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   5895 	rev	r9, r12                                 @ CTR block 4k+7
   5896 	eor	q4, q4, v11.16b                           @ PRE 1
   5897 
   5898 	rev64	q6, q6                                    @ GHASH block 4k+2
   5899 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   5900 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   5901 
   5902 	aese	q1, v19.16b
   5903 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   5904 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   5905 
   5906 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   5907 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   5908 	fmov	d3, r10                               @ CTR block 4k+7
   5909 
   5910 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   5911 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   5912 
   5913 	aese	q2, v18.16b
   5914 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   5915 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   5916 
   5917 	aese	q0, v19.16b
   5918 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   5919 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   5920 
   5921 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   5922 
   5923 	aese	q2, v19.16b
   5924 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   5925 	rev64	q7, q7                                    @ GHASH block 4k+3
   5926 
   5927 	aese	q3, v18.16b
   5928 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   5929 
   5930 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   5931 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   5932 
   5933 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   5934 
   5935 	aese	q3, v19.16b
   5936 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   5937 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   5938 
   5939 	aese	q0, v20.16b
   5940 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   5941 
   5942 	aese	q1, v20.16b
   5943 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   5944 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   5945 
   5946 	aese	q2, v20.16b
   5947 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   5948 
   5949 	aese	q0, v21.16b
   5950 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   5951 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   5952 
   5953 	aese	q3, v20.16b
   5954 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   5955 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   5956 
   5957 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   5958 
   5959 	aese	q0, v22.16b
   5960 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   5961 
   5962 	aese	q3, v21.16b
   5963 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   5964 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   5965 
   5966 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   5967 
   5968 	aese	q0, v23.16b
   5969 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   5970 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   5971 
   5972 	aese	q3, v22.16b
   5973 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   5974 
   5975 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   5976 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   5977 
   5978 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   5979 
   5980 	aese	q3, v23.16b
   5981 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   5982 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   5983 
   5984 	aese	q2, v21.16b
   5985 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   5986 
   5987 	aese	q1, v21.16b
   5988 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   5989 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   5990 
   5991 	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   5992 
   5993 	aese	q2, v22.16b
   5994 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   5995 	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
   5996 
   5997 	aese	q1, v22.16b
   5998 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   5999 
   6000 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   6001 
   6002 	aese	q2, v23.16b
   6003 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   6004 	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
   6005 
   6006 	aese	q1, v23.16b
   6007 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   6008 
   6009 	aese	q3, v24.16b
   6010 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   6011 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   6012 
   6013 	aese	q2, v24.16b
   6014 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   6015 
   6016 	aese	q0, v24.16b
   6017 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   6018 	movi	q8, #0xc2
   6019 
   6020 	aese	q1, v24.16b
   6021 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   6022 	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
   6023 
   6024 	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
   6025 
   6026 	aese	q3, v25.16b
   6027 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   6028 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   6029 
   6030 	aese	q1, v25.16b
   6031 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   6032 
   6033 	aese	q0, v25.16b
   6034 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   6035 	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
   6036 
   6037 	aese	q3, v26.16b
   6038 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   6039 
   6040 	aese	q2, v25.16b
   6041 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   6042 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   6043 
   6044 	aese	q1, v26.16b
   6045 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   6046 
   6047 	aese	q0, v26.16b
   6048 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   6049 	shl	d8, d8, #56               @ mod_constant
   6050 
   6051 	aese	q2, v26.16b
   6052 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   6053 
   6054 	aese	q1, v27.16b
   6055 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   6056 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   6057 
   6058 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   6059 
   6060 	aese	q2, v27.16b
   6061 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   6062 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   6063 
   6064 	aese	q3, v27.16b
   6065 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   6066 
   6067 	aese	q0, v27.16b
   6068 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   6069 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   6070 
   6071 	aese	q2, v28.16b
   6072 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   6073 
   6074 	aese	q3, v28.16b
   6075 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   6076 
   6077 	aese	q0, v28.16b
   6078 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   6079 	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
   6080 #ifdef __ARMEB__
   6081 	rev	r22, r22
   6082 #endif
   6083 	aese	q1, v28.16b
   6084 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   6085 	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
   6086 #ifdef __ARMEB__
   6087 	rev	r23, r23
   6088 #endif
   6089 	aese	q2, v29.16b
   6090 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   6091 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   6092 
   6093 	aese	q0, v29.16b
   6094 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   6095 	add	r12, r12, #1                            @ CTR block 4k+7
   6096 
   6097 	aese	q1, v29.16b
   6098 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   6099 	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
   6100 #ifdef __ARMEB__
   6101 	rev	r21, r21
   6102 #endif
   6103 
   6104 	aese	q2, v30.16b
   6105 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   6106 
   6107 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   6108 	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
   6109 #ifdef __ARMEB__
   6110 	rev	r24, r24
   6111 #endif
   6112 
   6113 	aese	q3, v29.16b
   6114 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   6115 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   6116 
   6117 	aese	q1, v30.16b
   6118 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   6119 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   6120 
   6121 	aese	q0, v30.16b
   6122 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   6123 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   6124 
   6125 	aese	q3, v30.16b
   6126 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   6127 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   6128 
   6129 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   6130 
   6131 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   6132 
   6133 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   6134 
   6135 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   6136 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   6137 .L256_dec_tail:@ TAIL
   6138 
   6139 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
   6140 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   6141 
   6142 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   6143 
   6144 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   6145 
   6146 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   6147 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   6148 
   6149 	cmp	r5, #48
   6150 
   6151 	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
   6152 #ifdef __ARMEB__
   6153 	rev	r6, r6
   6154 #endif
   6155 
   6156 	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
   6157 #ifdef __ARMEB__
   6158 	rev	r7, r7
   6159 #endif
   6160 	bgt	.L256_dec_blocks_more_than_3
   6161 
   6162 	sub	r12, r12, #1
   6163 	mov	q3, q2
   6164 	movi	v10.8b, #0
   6165 
   6166 	movi	v11.8b, #0
   6167 	cmp	r5, #32
   6168 
   6169 	movi	q9, #0
   6170 	mov	q2, q1
   6171 	bgt	.L256_dec_blocks_more_than_2
   6172 
   6173 	sub	r12, r12, #1
   6174 
   6175 	mov	q3, q1
   6176 	cmp	r5, #16
   6177 	bgt	.L256_dec_blocks_more_than_1
   6178 
   6179 	sub	r12, r12, #1
   6180 	b	.L256_dec_blocks_less_than_1
   6181 .L256_dec_blocks_more_than_3:@ blocks left >  3
   6182 	rev64	q4, q5                                   @ GHASH final-3 block
   6183 	ld1	{ q5}, [r0], #16                     @ AES final-2 block - load ciphertext
   6184 
   6185 	stp	r6, r7, [r2], #16       @ AES final-3 block  - store result
   6186 
   6187 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   6188 
   6189 	eor	q4, q4, q8                          @ feed in partial tag
   6190 
   6191 	eor	q0, q5, q1                           @ AES final-2 block - result
   6192 
   6193 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   6194 
   6195 	mov	r6, v0.d[0]                           @ AES final-2 block - mov low
   6196 
   6197 	mov	r7, v0.d[1]                           @ AES final-2 block - mov high
   6198 
   6199 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   6200 
   6201 	movi	q8, #0                                       @ suppress further partial tag feed in
   6202 
   6203 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   6204 
   6205 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   6206 	eor	r6, r6, r13                  @ AES final-2 block - round 14 low
   6207 #ifdef __ARMEB__
   6208 	rev	r6, r6
   6209 #endif
   6210 
   6211 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   6212 	eor	r7, r7, r14                  @ AES final-2 block - round 14 high
   6213 #ifdef __ARMEB__
   6214 	rev	r7, r7
   6215 #endif
   6216 .L256_dec_blocks_more_than_2:@ blocks left >  2
   6217 
   6218 	rev64	q4, q5                                   @ GHASH final-2 block
   6219 	ld1	{ q5}, [r0], #16                     @ AES final-1 block - load ciphertext
   6220 
   6221 	eor	q4, q4, q8                          @ feed in partial tag
   6222 	stp	r6, r7, [r2], #16       @ AES final-2 block  - store result
   6223 
   6224 	eor	q0, q5, q2                           @ AES final-1 block - result
   6225 
   6226 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   6227 
   6228 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   6229 
   6230 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   6231 
   6232 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   6233 	mov	r6, v0.d[0]                           @ AES final-1 block - mov low
   6234 
   6235 	mov	r7, v0.d[1]                           @ AES final-1 block - mov high
   6236 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   6237 	movi	q8, #0                                       @ suppress further partial tag feed in
   6238 
   6239 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   6240 
   6241 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   6242 	eor	r6, r6, r13                  @ AES final-1 block - round 14 low
   6243 #ifdef __ARMEB__
   6244 	rev	r6, r6
   6245 #endif
   6246 
   6247 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   6248 	eor	r7, r7, r14                  @ AES final-1 block - round 14 high
   6249 #ifdef __ARMEB__
   6250 	rev	r7, r7
   6251 #endif
   6252 .L256_dec_blocks_more_than_1:@ blocks left >  1
   6253 
   6254 	stp	r6, r7, [r2], #16       @ AES final-1 block  - store result
   6255 	rev64	q4, q5                                   @ GHASH final-1 block
   6256 
   6257 	ld1	{ q5}, [r0], #16                     @ AES final block - load ciphertext
   6258 
   6259 	eor	q4, q4, q8                          @ feed in partial tag
   6260 	movi	q8, #0                                       @ suppress further partial tag feed in
   6261 
   6262 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   6263 
   6264 	eor	q0, q5, q3                           @ AES final block - result
   6265 
   6266 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   6267 
   6268 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   6269 
   6270 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   6271 	mov	r6, v0.d[0]                           @ AES final block - mov low
   6272 
   6273 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   6274 
   6275 	mov	r7, v0.d[1]                           @ AES final block - mov high
   6276 
   6277 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   6278 	eor	r6, r6, r13                  @ AES final block - round 14 low
   6279 #ifdef __ARMEB__
   6280 	rev	r6, r6
   6281 #endif
   6282 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   6283 
   6284 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   6285 
   6286 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   6287 	eor	r7, r7, r14                  @ AES final block - round 14 high
   6288 #ifdef __ARMEB__
   6289 	rev	r7, r7
   6290 #endif
   6291 .L256_dec_blocks_less_than_1:@ blocks left <= 1
   6292 
   6293 	and	r1, r1, #127                   @ bit_length %= 128
   6294 	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff
   6295 
   6296 	sub	r1, r1, #128                   @ bit_length -= 128
   6297 	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff
   6298 
   6299 	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
   6300 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   6301 
   6302 	and	r1, r1, #127                   @ bit_length %= 128
   6303 
   6304 	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
   6305 	cmp	r1, #64
   6306 
   6307 	csel	r9, r13, r14, lt
   6308 	csel	r10, r14, xzr, lt
   6309 
   6310 	fmov	d0, r9                                  @ ctr0b is mask for last block
   6311 	and	r6, r6, r9
   6312 
   6313 	mov	v0.d[1], r10
   6314 	bic	r4, r4, r9          @ mask out low existing bytes
   6315 
   6316 #ifndef __ARMEB__
   6317 	rev	r9, r12
   6318 #else
   6319 	mov	r9, r12
   6320 #endif
   6321 
   6322 	bic	r5, r5, r10      @ mask out high existing bytes
   6323 
   6324 	orr	r6, r6, r4
   6325 
   6326 	and	r7, r7, r10
   6327 
   6328 	orr	r7, r7, r5
   6329 
   6330 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   6331 
   6332 	rev64	q4, q5                                    @ GHASH final block
   6333 
   6334 	eor	q4, q4, q8                           @ feed in partial tag
   6335 
   6336 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   6337 
   6338 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   6339 
   6340 	eor	q8, q8, q4                          @ GHASH final block - mid
   6341 
   6342 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   6343 
   6344 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   6345 
   6346 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   6347 
   6348 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   6349 
   6350 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
   6351 	movi	q8, #0xc2
   6352 
   6353 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   6354 
   6355 	shl	d8, d8, #56               @ mod_constant
   6356 
   6357 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   6358 
   6359 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   6360 
   6361 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   6362 
   6363 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   6364 
   6365 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   6366 
   6367 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   6368 
   6369 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   6370 
   6371 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   6372 
   6373 	stp	r6, r7, [r2]
   6374 
   6375 	str	r9, [r16, #12]                          @ store the updated counter
   6376 
   6377 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   6378 	ext	v11.16b, v11.16b, v11.16b, #8
   6379 	rev64	v11.16b, v11.16b
   6380 	mov	r0, r15
   6381 	st1	{ v11.16b }, [r3]
   6382 
   6383 	ldp	r21, r22, [sp, #16]
   6384 	ldp	r23, r24, [sp, #32]
   6385 	ldp	d8, d9, [sp, #48]
   6386 	ldp	d10, d11, [sp, #64]
   6387 	ldp	d12, d13, [sp, #80]
   6388 	ldp	d14, d15, [sp, #96]
   6389 	ldp	r19, r20, [sp], #112
   6390 	RET
   6391 
   6392 .L256_dec_ret:
   6393 	mov	r0, #0x0
   6394 	RET
   6395 .size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
   6396 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	@ NUL-terminated ASCII identification string: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
   6397 .align	2	@ re-align the location counter after the odd-length (51-byte) string above
   6398 .align	2	@ repeats the previous directive; already aligned, so it adds no further padding
   6399 #endif
   6400