Home | History | Annotate | Line # | Download | only in arm
      1 #include "arm_asm.h"
      2 #include "arm_arch.h"
      3 
      4 #if __ARM_MAX_ARCH__>=8
      5 .fpu	neon
      @ Pick the instruction set for the code that follows and define the
      @ INST helper macro to match it.
      @
      @ INST(a,b,c,d) expands to a comma-separated list of four items built
      @ from its first three arguments plus one fixed constant. The fourth
      @ argument (d) is accepted but does not appear in either expansion.
      @ NOTE(review): this looks like a helper for hand-encoding instructions
      @ as raw bytes (e.g. operands of a .byte directive); no use of INST is
      @ visible in this part of the file, so confirm against the callers.
      6 #ifdef __thumb2__
      @ Thumb-2 build: unified assembler syntax, Thumb encoding.
      7 .syntax	unified
      8 .thumb
      @ Thumb-2 ordering: c, the constant 0xef, then a, b.
      9 # define INST(a,b,c,d)   c,0xef,a,b
     10 #else
      @ Otherwise: 32-bit ARM encoding.
     11 .code	32
      @ ARM ordering: a, b, c, then the constant 0xf2.
     12 # define INST(a,b,c,d)   a,b,c,0xf2
     13 #endif
     14 
     15 .text
     16 .globl	aes_gcm_enc_128_kernel
     17 .type	aes_gcm_enc_128_kernel,%function
     18 .align	4
     19 aes_gcm_enc_128_kernel:
     20 	AARCH64_VALID_CALL_TARGET
     21 	cbz	r1, .L128_enc_ret
     22 	stp	r19, r20, [sp, #-112]!
     23 	mov	r16, r4
     24 	mov	r8, r5
     25 	stp	r21, r22, [sp, #16]
     26 	stp	r23, r24, [sp, #32]
     27 	stp	d8, d9, [sp, #48]
     28 	stp	d10, d11, [sp, #64]
     29 	stp	d12, d13, [sp, #80]
     30 	stp	d14, d15, [sp, #96]
     31 
     32 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
     33 #ifdef __ARMEB__
     34 	rev	r10, r10
     35 	rev	r11, r11
     36 #endif
     37 	ldp	r13, r14, [r8, #160]                     @ load rk10
     38 #ifdef __ARMEB__
     39 	ror	r13, r13, #32
     40 	ror	r14, r14, #32
     41 #endif
     42 	ld1	{v11.16b}, [r3]
     43 	ext	v11.16b, v11.16b, v11.16b, #8
     44 	rev64	v11.16b, v11.16b
     45 	lsr	r5, r1, #3              @ byte_len
     46 	mov	r15, r5
     47 
     48 	ld1	{v18.4s}, [r8], #16								  @ load rk0
     49 	add	r4, r0, r1, lsr #3   @ end_input_ptr
     50 	sub	r5, r5, #1      @ byte_len - 1
     51 
     52 	lsr	r12, r11, #32
     53 	ldr	q15, [r3, #112]                        @ load h4l | h4h
     54 #ifndef __ARMEB__
     55 	ext	v15.16b, v15.16b, v15.16b, #8
     56 #endif
     57 	fmov	d1, r10                               @ CTR block 1
     58 	rev	r12, r12                                @ rev_ctr32
     59 
     60 	add	r12, r12, #1                            @ increment rev_ctr32
     61 	orr	r11, r11, r11
     62 	ld1	{v19.4s}, [r8], #16								  @ load rk1
     63 
     64 	rev	r9, r12                                 @ CTR block 1
     65 	add	r12, r12, #1                            @ CTR block 1
     66 	fmov	d3, r10                               @ CTR block 3
     67 
     68 	orr	r9, r11, r9, lsl #32            @ CTR block 1
     69 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
     70 
     71 	fmov	v1.d[1], r9                               @ CTR block 1
     72 	rev	r9, r12                                 @ CTR block 2
     73 
     74 	fmov	d2, r10                               @ CTR block 2
     75 	orr	r9, r11, r9, lsl #32            @ CTR block 2
     76 	add	r12, r12, #1                            @ CTR block 2
     77 
     78 	fmov	v2.d[1], r9                               @ CTR block 2
     79 	rev	r9, r12                                 @ CTR block 3
     80 
     81 	orr	r9, r11, r9, lsl #32            @ CTR block 3
     82 	ld1	{v20.4s}, [r8], #16								  @ load rk2
     83 
     84 	add	r12, r12, #1                            @ CTR block 3
     85 	fmov	v3.d[1], r9                               @ CTR block 3
     86 
     87 	ldr	q14, [r3, #80]                         @ load h3l | h3h
     88 #ifndef __ARMEB__
     89 	ext	v14.16b, v14.16b, v14.16b, #8
     90 #endif
     91 	aese	q1, v18.16b
     92 	aesmc	q1, q1          @ AES block 1 - round 0
     93 	ld1	{v21.4s}, [r8], #16								  @ load rk3
     94 
     95 	aese	q2, v18.16b
     96 	aesmc	q2, q2          @ AES block 2 - round 0
     97 	ldr	q12, [r3, #32]                         @ load h1l | h1h
     98 #ifndef __ARMEB__
     99 	ext	v12.16b, v12.16b, v12.16b, #8
    100 #endif
    101 
    102 	aese	q0, v18.16b
    103 	aesmc	q0, q0          @ AES block 0 - round 0
    104 	ld1	{v22.4s}, [r8], #16								  @ load rk4
    105 
    106 	aese	q3, v18.16b
    107 	aesmc	q3, q3          @ AES block 3 - round 0
    108 	ld1	{v23.4s}, [r8], #16								  @ load rk5
    109 
    110 	aese	q2, v19.16b
    111 	aesmc	q2, q2          @ AES block 2 - round 1
    112 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
    113 
    114 	aese	q0, v19.16b
    115 	aesmc	q0, q0          @ AES block 0 - round 1
    116 	ld1	{v24.4s}, [r8], #16								  @ load rk6
    117 
    118 	aese	q1, v19.16b
    119 	aesmc	q1, q1          @ AES block 1 - round 1
    120 	ld1	{v25.4s}, [r8], #16								  @ load rk7
    121 
    122 	aese	q3, v19.16b
    123 	aesmc	q3, q3          @ AES block 3 - round 1
    124 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
    125 
    126 	aese	q0, v20.16b
    127 	aesmc	q0, q0          @ AES block 0 - round 2
    128 	ld1	{v26.4s}, [r8], #16								  @ load rk8
    129 
    130 	aese	q1, v20.16b
    131 	aesmc	q1, q1          @ AES block 1 - round 2
    132 	ldr	q13, [r3, #64]                         @ load h2l | h2h
    133 #ifndef __ARMEB__
    134 	ext	v13.16b, v13.16b, v13.16b, #8
    135 #endif
    136 
    137 	aese	q3, v20.16b
    138 	aesmc	q3, q3          @ AES block 3 - round 2
    139 
    140 	aese	q2, v20.16b
    141 	aesmc	q2, q2          @ AES block 2 - round 2
    142 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
    143 
    144 	aese	q0, v21.16b
    145 	aesmc	q0, q0          @ AES block 0 - round 3
    146 
    147 	aese	q1, v21.16b
    148 	aesmc	q1, q1          @ AES block 1 - round 3
    149 
    150 	aese	q2, v21.16b
    151 	aesmc	q2, q2          @ AES block 2 - round 3
    152 	ld1	{v27.4s}, [r8], #16								  @ load rk9
    153 
    154 	aese	q3, v21.16b
    155 	aesmc	q3, q3          @ AES block 3 - round 3
    156 
    157 	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    158 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
    159 
    160 	aese	q3, v22.16b
    161 	aesmc	q3, q3          @ AES block 3 - round 4
    162 	add	r5, r5, r0
    163 
    164 	aese	q2, v22.16b
    165 	aesmc	q2, q2          @ AES block 2 - round 4
    166 	cmp	r0, r5                   @ check if we have <= 4 blocks
    167 
    168 	aese	q0, v22.16b
    169 	aesmc	q0, q0          @ AES block 0 - round 4
    170 
    171 	aese	q3, v23.16b
    172 	aesmc	q3, q3          @ AES block 3 - round 5
    173 
    174 	aese	q2, v23.16b
    175 	aesmc	q2, q2          @ AES block 2 - round 5
    176 
    177 	aese	q0, v23.16b
    178 	aesmc	q0, q0          @ AES block 0 - round 5
    179 
    180 	aese	q3, v24.16b
    181 	aesmc	q3, q3          @ AES block 3 - round 6
    182 
    183 	aese	q1, v22.16b
    184 	aesmc	q1, q1          @ AES block 1 - round 4
    185 
    186 	aese	q2, v24.16b
    187 	aesmc	q2, q2          @ AES block 2 - round 6
    188 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
    189 
    190 	aese	q0, v24.16b
    191 	aesmc	q0, q0          @ AES block 0 - round 6
    192 
    193 	aese	q1, v23.16b
    194 	aesmc	q1, q1          @ AES block 1 - round 5
    195 
    196 	aese	q3, v25.16b
    197 	aesmc	q3, q3          @ AES block 3 - round 7
    198 
    199 	aese	q0, v25.16b
    200 	aesmc	q0, q0          @ AES block 0 - round 7
    201 
    202 	aese	q1, v24.16b
    203 	aesmc	q1, q1          @ AES block 1 - round 6
    204 
    205 	aese	q2, v25.16b
    206 	aesmc	q2, q2          @ AES block 2 - round 7
    207 
    208 	aese	q0, v26.16b
    209 	aesmc	q0, q0          @ AES block 0 - round 8
    210 
    211 	aese	q1, v25.16b
    212 	aesmc	q1, q1          @ AES block 1 - round 7
    213 
    214 	aese	q2, v26.16b
    215 	aesmc	q2, q2          @ AES block 2 - round 8
    216 
    217 	aese	q3, v26.16b
    218 	aesmc	q3, q3          @ AES block 3 - round 8
    219 
    220 	aese	q1, v26.16b
    221 	aesmc	q1, q1          @ AES block 1 - round 8
    222 
    223 	aese	q2, v27.16b                                      @ AES block 2 - round 9
    224 
    225 	aese	q0, v27.16b                                      @ AES block 0 - round 9
    226 
    227 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
    228 
    229 	aese	q1, v27.16b                                      @ AES block 1 - round 9
    230 
    231 	aese	q3, v27.16b                                      @ AES block 3 - round 9
    232 	bge	.L128_enc_tail                                    @ handle tail
    233 
    234 	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
    235 #ifdef __ARMEB__
    236 	rev	r6, r6
    237 	rev	r7, r7
    238 #endif
    239 	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
    240 #ifdef __ARMEB__
    241 	rev	r21, r21
    242 	rev	r22, r22
    243 #endif
    244 	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
    245 #ifdef __ARMEB__
    246 	rev	r19, r19
    247 	rev	r20, r20
    248 #endif
    249 	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
    250 #ifdef __ARMEB__
    251 	rev	r23, r23
    252 	rev	r24, r24
    253 #endif
    254 	eor	r6, r6, r13                     @ AES block 0 - round 10 low
    255 	eor	r7, r7, r14                     @ AES block 0 - round 10 high
    256 
    257 	eor	r21, r21, r13                     @ AES block 2 - round 10 low
    258 	fmov	d4, r6                               @ AES block 0 - mov low
    259 
    260 	eor	r19, r19, r13                     @ AES block 1 - round 10 low
    261 	eor	r22, r22, r14                     @ AES block 2 - round 10 high
    262 	fmov	v4.d[1], r7                           @ AES block 0 - mov high
    263 
    264 	fmov	d5, r19                               @ AES block 1 - mov low
    265 	eor	r20, r20, r14                     @ AES block 1 - round 10 high
    266 
    267 	eor	r23, r23, r13                     @ AES block 3 - round 10 low
    268 	fmov	v5.d[1], r20                           @ AES block 1 - mov high
    269 
    270 	fmov	d6, r21                               @ AES block 2 - mov low
    271 	eor	r24, r24, r14                     @ AES block 3 - round 10 high
    272 	rev	r9, r12                                 @ CTR block 4
    273 
    274 	fmov	v6.d[1], r22                           @ AES block 2 - mov high
    275 	orr	r9, r11, r9, lsl #32            @ CTR block 4
    276 
    277 	eor	q4, q4, q0                          @ AES block 0 - result
    278 	fmov	d0, r10                               @ CTR block 4
    279 	add	r12, r12, #1                            @ CTR block 4
    280 
    281 	fmov	v0.d[1], r9                               @ CTR block 4
    282 	rev	r9, r12                                 @ CTR block 5
    283 
    284 	eor	q5, q5, q1                          @ AES block 1 - result
    285 	fmov	d1, r10                               @ CTR block 5
    286 	orr	r9, r11, r9, lsl #32            @ CTR block 5
    287 
    288 	add	r12, r12, #1                            @ CTR block 5
    289 	add	r0, r0, #64                       @ AES input_ptr update
    290 	fmov	v1.d[1], r9                               @ CTR block 5
    291 
    292 	fmov	d7, r23                               @ AES block 3 - mov low
    293 	rev	r9, r12                                 @ CTR block 6
    294 	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
    295 
    296 	fmov	v7.d[1], r24                           @ AES block 3 - mov high
    297 	orr	r9, r11, r9, lsl #32            @ CTR block 6
    298 
    299 	add	r12, r12, #1                            @ CTR block 6
    300 	eor	q6, q6, q2                          @ AES block 2 - result
    301 	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
    302 
    303 	fmov	d2, r10                               @ CTR block 6
    304 	cmp	r0, r5                   @ check if we have <= 8 blocks
    305 
    306 	fmov	v2.d[1], r9                               @ CTR block 6
    307 	rev	r9, r12                                 @ CTR block 7
    308 	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
    309 
    310 	orr	r9, r11, r9, lsl #32            @ CTR block 7
    311 
    312 	eor	q7, q7, q3                          @ AES block 3 - result
    313 	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
    314 	bge	.L128_enc_prepretail                              @ do prepretail
    315 
    316 .L128_enc_main_loop:@ main loop start
    317 	ldp	r23, r24, [r0, #48]           @ AES block 4k+3 - load plaintext
    318 #ifdef __ARMEB__
    319 	rev	r23, r23
    320 	rev	r24, r24
    321 #endif
    322 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
    323 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
    324 
    325 	aese	q2, v18.16b
    326 	aesmc	q2, q2          @ AES block 4k+6 - round 0
    327 	fmov	d3, r10                               @ CTR block 4k+3
    328 
    329 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
    330 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
    331 
    332 	aese	q1, v18.16b
    333 	aesmc	q1, q1          @ AES block 4k+5 - round 0
    334 	add	r12, r12, #1                            @ CTR block 4k+3
    335 	fmov	v3.d[1], r9                               @ CTR block 4k+3
    336 
    337 	aese	q0, v18.16b
    338 	aesmc	q0, q0          @ AES block 4k+4 - round 0
    339 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
    340 
    341 	aese	q2, v19.16b
    342 	aesmc	q2, q2          @ AES block 4k+6 - round 1
    343 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
    344 
    345 	aese	q1, v19.16b
    346 	aesmc	q1, q1          @ AES block 4k+5 - round 1
    347 	eor	q4, q4, v11.16b                           @ PRE 1
    348 
    349 	aese	q3, v18.16b
    350 	aesmc	q3, q3          @ AES block 4k+7 - round 0
    351 	eor	r24, r24, r14                     @ AES block 4k+3 - round 10 high
    352 
    353 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
    354 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
    355 	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
    356 #ifdef __ARMEB__
    357 	rev	r6, r6
    358 	rev	r7, r7
    359 #endif
    360 	aese	q0, v19.16b
    361 	aesmc	q0, q0          @ AES block 4k+4 - round 1
    362 	rev	r9, r12                                 @ CTR block 4k+8
    363 
    364 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
    365 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
    366 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
    367 
    368 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
    369 	add	r12, r12, #1                            @ CTR block 4k+8
    370 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
    371 
    372 	aese	q0, v20.16b
    373 	aesmc	q0, q0          @ AES block 4k+4 - round 2
    374 
    375 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
    376 	eor	q8, q8, q4                          @ GHASH block 4k - mid
    377 
    378 	aese	q1, v20.16b
    379 	aesmc	q1, q1          @ AES block 4k+5 - round 2
    380 
    381 	aese	q0, v21.16b
    382 	aesmc	q0, q0          @ AES block 4k+4 - round 3
    383 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
    384 
    385 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
    386 
    387 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
    388 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
    389 
    390 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
    391 
    392 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
    393 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
    394 
    395 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
    396 	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
    397 
    398 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
    399 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
    400 
    401 	aese	q3, v19.16b
    402 	aesmc	q3, q3          @ AES block 4k+7 - round 1
    403 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
    404 
    405 	aese	q2, v20.16b
    406 	aesmc	q2, q2          @ AES block 4k+6 - round 2
    407 	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
    408 
    409 	aese	q1, v21.16b
    410 	aesmc	q1, q1          @ AES block 4k+5 - round 3
    411 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
    412 
    413 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
    414 
    415 	aese	q2, v21.16b
    416 	aesmc	q2, q2          @ AES block 4k+6 - round 3
    417 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
    418 
    419 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
    420 
    421 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
    422 	movi	q8, #0xc2
    423 
    424 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
    425 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
    426 
    427 	aese	q1, v22.16b
    428 	aesmc	q1, q1          @ AES block 4k+5 - round 4
    429 
    430 	aese	q3, v20.16b
    431 	aesmc	q3, q3          @ AES block 4k+7 - round 2
    432 	shl	d8, d8, #56               @ mod_constant
    433 
    434 	aese	q0, v22.16b
    435 	aesmc	q0, q0          @ AES block 4k+4 - round 4
    436 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
    437 
    438 	aese	q1, v23.16b
    439 	aesmc	q1, q1          @ AES block 4k+5 - round 5
    440 	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
    441 #ifdef __ARMEB__
    442 	rev	r19, r19
    443 	rev	r20, r20
    444 #endif
    445 	aese	q3, v21.16b
    446 	aesmc	q3, q3          @ AES block 4k+7 - round 3
    447 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
    448 
    449 	aese	q0, v23.16b
    450 	aesmc	q0, q0          @ AES block 4k+4 - round 5
    451 	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
    452 #ifdef __ARMEB__
    453 	rev	r21, r21
    454 	rev	r22, r22
    455 #endif
    456 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
    457 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
    458 
    459 	aese	q2, v22.16b
    460 	aesmc	q2, q2          @ AES block 4k+6 - round 4
    461 	eor	r19, r19, r13                     @ AES block 4k+5 - round 10 low
    462 
    463 	aese	q3, v22.16b
    464 	aesmc	q3, q3          @ AES block 4k+7 - round 4
    465 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
    466 
    467 	aese	q1, v24.16b
    468 	aesmc	q1, q1          @ AES block 4k+5 - round 6
    469 	eor	r23, r23, r13                     @ AES block 4k+3 - round 10 low
    470 
    471 	aese	q2, v23.16b
    472 	aesmc	q2, q2          @ AES block 4k+6 - round 5
    473 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
    474 
    475 	fmov	d4, r6                               @ AES block 4k+4 - mov low
    476 	aese	q0, v24.16b
    477 	aesmc	q0, q0          @ AES block 4k+4 - round 6
    478 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
    479 
    480 	add	r0, r0, #64                       @ AES input_ptr update
    481 	fmov	d7, r23                               @ AES block 4k+3 - mov low
    482 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
    483 
    484 	aese	q3, v23.16b
    485 	aesmc	q3, q3          @ AES block 4k+7 - round 5
    486 	fmov	d5, r19                               @ AES block 4k+5 - mov low
    487 
    488 	aese	q0, v25.16b
    489 	aesmc	q0, q0          @ AES block 4k+4 - round 7
    490 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
    491 
    492 	aese	q2, v24.16b
    493 	aesmc	q2, q2          @ AES block 4k+6 - round 6
    494 	eor	r20, r20, r14                     @ AES block 4k+5 - round 10 high
    495 
    496 	aese	q1, v25.16b
    497 	aesmc	q1, q1          @ AES block 4k+5 - round 7
    498 	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
    499 
    500 	aese	q0, v26.16b
    501 	aesmc	q0, q0          @ AES block 4k+4 - round 8
    502 	fmov	v7.d[1], r24                           @ AES block 4k+3 - mov high
    503 
    504 	aese	q3, v24.16b
    505 	aesmc	q3, q3          @ AES block 4k+7 - round 6
    506 	cmp	r0, r5                   @ .LOOP CONTROL
    507 
    508 	aese	q1, v26.16b
    509 	aesmc	q1, q1          @ AES block 4k+5 - round 8
    510 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
    511 
    512 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
    513 	eor	r21, r21, r13                     @ AES block 4k+6 - round 10 low
    514 	eor	r22, r22, r14                     @ AES block 4k+6 - round 10 high
    515 
    516 	aese	q3, v25.16b
    517 	aesmc	q3, q3          @ AES block 4k+7 - round 7
    518 	fmov	d6, r21                               @ AES block 4k+6 - mov low
    519 
    520 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
    521 	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
    522 
    523 	aese	q2, v25.16b
    524 	aesmc	q2, q2          @ AES block 4k+6 - round 7
    525 	eor	q4, q4, q0                          @ AES block 4k+4 - result
    526 
    527 	fmov	d0, r10                               @ CTR block 4k+8
    528 	aese	q3, v26.16b
    529 	aesmc	q3, q3          @ AES block 4k+7 - round 8
    530 
    531 	fmov	v0.d[1], r9                               @ CTR block 4k+8
    532 	rev	r9, r12                                 @ CTR block 4k+9
    533 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
    534 
    535 	aese	q2, v26.16b
    536 	aesmc	q2, q2          @ AES block 4k+6 - round 8
    537 	eor	q5, q5, q1                          @ AES block 4k+5 - result
    538 
    539 	add	r12, r12, #1                            @ CTR block 4k+9
    540 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
    541 	fmov	d1, r10                               @ CTR block 4k+9
    542 
    543 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
    544 	fmov	v1.d[1], r9                               @ CTR block 4k+9
    545 	rev	r9, r12                                 @ CTR block 4k+10
    546 
    547 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
    548 	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
    549 	eor	q6, q6, q2                          @ AES block 4k+6 - result
    550 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
    551 
    552 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
    553 	add	r12, r12, #1                            @ CTR block 4k+10
    554 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
    555 	fmov	d2, r10                               @ CTR block 4k+10
    556 
    557 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
    558 	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
    559 
    560 	fmov	v2.d[1], r9                               @ CTR block 4k+10
    561 	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
    562 	rev	r9, r12                                 @ CTR block 4k+11
    563 
    564 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
    565 	eor	q7, q7, q3                          @ AES block 4k+3 - result
    566 
    567 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
    568 	st1	{ q7}, [r2], #16                     @ AES block 4k+3 - store result
    569 	blt	.L128_enc_main_loop
    570 
    571 .L128_enc_prepretail:@ PREPRETAIL
    572 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
    573 	fmov	d3, r10                               @ CTR block 4k+3
    574 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
    575 
    576 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
    577 	add	r12, r12, #1                            @ CTR block 4k+3
    578 	fmov	v3.d[1], r9                               @ CTR block 4k+3
    579 
    580 	aese	q1, v18.16b
    581 	aesmc	q1, q1          @ AES block 4k+5 - round 0
    582 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
    583 
    584 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
    585 
    586 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
    587 	eor	q4, q4, v11.16b                           @ PRE 1
    588 
    589 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
    590 
    591 	aese	q3, v18.16b
    592 	aesmc	q3, q3          @ AES block 4k+7 - round 0
    593 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
    594 
    595 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
    596 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
    597 
    598 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
    599 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
    600 
    601 	aese	q1, v19.16b
    602 	aesmc	q1, q1          @ AES block 4k+5 - round 1
    603 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
    604 
    605 	eor	q8, q8, q4                          @ GHASH block 4k - mid
    606 
    607 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
    608 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
    609 
    610 	aese	q3, v19.16b
    611 	aesmc	q3, q3          @ AES block 4k+7 - round 1
    612 
    613 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
    614 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
    615 
    616 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
    617 
    618 	aese	q0, v18.16b
    619 	aesmc	q0, q0          @ AES block 4k+4 - round 0
    620 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
    621 
    622 	aese	q2, v18.16b
    623 	aesmc	q2, q2          @ AES block 4k+6 - round 0
    624 
    625 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
    626 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
    627 
    628 	aese	q0, v19.16b
    629 	aesmc	q0, q0          @ AES block 4k+4 - round 1
    630 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
    631 
    632 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
    633 
    634 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
    635 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
    636 
    637 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
    638 
    639 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
    640 
    641 	aese	q2, v19.16b
    642 	aesmc	q2, q2          @ AES block 4k+6 - round 1
    643 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
    644 
    645 	aese	q0, v20.16b
    646 	aesmc	q0, q0          @ AES block 4k+4 - round 2
    647 
    648 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
    649 	movi	q8, #0xc2
    650 
    651 	aese	q2, v20.16b
    652 	aesmc	q2, q2          @ AES block 4k+6 - round 2
    653 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
    654 
    655 	aese	q3, v20.16b
    656 	aesmc	q3, q3          @ AES block 4k+7 - round 2
    657 
    658 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
    659 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
    660 
    661 	aese	q2, v21.16b
    662 	aesmc	q2, q2          @ AES block 4k+6 - round 3
    663 
    664 	aese	q1, v20.16b
    665 	aesmc	q1, q1          @ AES block 4k+5 - round 2
    666 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
    667 
    668 	aese	q0, v21.16b
    669 	aesmc	q0, q0          @ AES block 4k+4 - round 3
    670 
    671 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
    672 	shl	d8, d8, #56               @ mod_constant
    673 
    674 	aese	q1, v21.16b
    675 	aesmc	q1, q1          @ AES block 4k+5 - round 3
    676 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
    677 
    678 	aese	q0, v22.16b
    679 	aesmc	q0, q0          @ AES block 4k+4 - round 4
    680 
    681 	pmull	v28.1q, q9, q8
    682 	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
    683 
    684 	aese	q1, v22.16b
    685 	aesmc	q1, q1          @ AES block 4k+5 - round 4
    686 
    687 	aese	q0, v23.16b
    688 	aesmc	q0, q0          @ AES block 4k+4 - round 5
    689 	ext	q9, q9, q9, #8
    690 
    691 	aese	q3, v21.16b
    692 	aesmc	q3, q3          @ AES block 4k+7 - round 3
    693 
    694 	aese	q2, v22.16b
    695 	aesmc	q2, q2          @ AES block 4k+6 - round 4
    696 	eor	v10.16b, v10.16b, v11.16b
    697 
    698 	aese	q0, v24.16b
    699 	aesmc	q0, q0          @ AES block 4k+4 - round 6
    700 
    701 	aese	q3, v22.16b
    702 	aesmc	q3, q3          @ AES block 4k+7 - round 4
    703 
    704 	aese	q1, v23.16b
    705 	aesmc	q1, q1          @ AES block 4k+5 - round 5
    706 
    707 	aese	q2, v23.16b
    708 	aesmc	q2, q2          @ AES block 4k+6 - round 5
    709 	eor	v10.16b, v10.16b, v28.16b
    710 
    711 	aese	q3, v23.16b
    712 	aesmc	q3, q3          @ AES block 4k+7 - round 5
    713 
    714 	aese	q1, v24.16b
    715 	aesmc	q1, q1          @ AES block 4k+5 - round 6
    716 
    717 	aese	q2, v24.16b
    718 	aesmc	q2, q2          @ AES block 4k+6 - round 6
    719 
    720 	aese	q3, v24.16b
    721 	aesmc	q3, q3          @ AES block 4k+7 - round 6
    722 	eor	v10.16b, v10.16b, q9
    723 
    724 	aese	q0, v25.16b
    725 	aesmc	q0, q0          @ AES block 4k+4 - round 7
    726 
    727 	aese	q2, v25.16b
    728 	aesmc	q2, q2          @ AES block 4k+6 - round 7
    729 
    730 	aese	q3, v25.16b
    731 	aesmc	q3, q3          @ AES block 4k+7 - round 7
    732 
    733 	pmull	v28.1q, v10.1d, q8
    734 
    735 	aese	q1, v25.16b
    736 	aesmc	q1, q1          @ AES block 4k+5 - round 7
    737 	ext	v10.16b, v10.16b, v10.16b, #8
    738 
    739 	aese	q3, v26.16b
    740 	aesmc	q3, q3          @ AES block 4k+7 - round 8
    741 
    742 	aese	q0, v26.16b
    743 	aesmc	q0, q0          @ AES block 4k+4 - round 8
    744 	eor	v11.16b, v11.16b, v28.16b
    745 
    746 	aese	q1, v26.16b
    747 	aesmc	q1, q1          @ AES block 4k+5 - round 8
    748 
    749 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
    750 
    751 	aese	q2, v26.16b
    752 	aesmc	q2, q2          @ AES block 4k+6 - round 8
    753 
    754 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
    755 
    756 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
    757 	eor	v11.16b, v11.16b, v10.16b
    758 
    759 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
     760 .L128_enc_tail:@ TAIL - encrypt the remaining input, dispatching on the byte count left in r5
     761 
     762 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process (end_input_ptr r4 minus current input pointer r0)
     763 	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
     764 #ifdef __ARMEB__
     765 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the low word just loaded
     766 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the high word just loaded
     767 #endif
     768 	cmp	r5, #48                                 @ more than 48 bytes (3 blocks) left? flags are consumed by the bgt below
     769 
     770 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
     771 	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
     772 	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
     773 
     774 	fmov	d4, r6                               @ AES block 4k+4 - mov low
     775 
     776 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
     777 
     778 	eor	q5, q4, q0                          @ AES block 4k+4 - result
     779 
     780 	bgt	.L128_enc_blocks_more_than_3           @ taken when r5 > 48 (flags from the cmp above)
     781 
     782 	sub	r12, r12, #1                            @ rev_ctr32 -= 1 on the "3 blocks or fewer" path; presumably backs out a counter step for a block that is not used - confirm
     783 	movi	v11.8b, #0                              @ clear GHASH low accumulator; the running tag was already copied into q8 above
     784 	mov	q3, q2                                  @ move AES block in q2 up to q3, the slot the final-block code XORs with
     785 
     786 	cmp	r5, #32                                 @ more than 32 bytes (2 blocks) left?
     787 	mov	q2, q1                                  @ move AES block in q1 up to q2, the slot the final-1 code XORs with
     788 	movi	q9, #0                                  @ clear GHASH high accumulator
     789 
     790 	movi	v10.8b, #0                              @ clear GHASH mid accumulator
     791 	bgt	.L128_enc_blocks_more_than_2           @ taken when r5 > 32
     792 
     793 	mov	q3, q1                                  @ 2 blocks or fewer: the final-block slot q3 takes the AES block from q1
     794 	cmp	r5, #16                                 @ more than 16 bytes (1 block) left?
     795 
     796 	sub	r12, r12, #1                            @ rev_ctr32 -= 1 again on the "2 blocks or fewer" path
     797 	bgt	.L128_enc_blocks_more_than_1           @ taken when r5 > 16
     798 
     799 	sub	r12, r12, #1                            @ rev_ctr32 -= 1 again on the "1 block or fewer" path
     800 	b	.L128_enc_blocks_less_than_1
     801 .L128_enc_blocks_more_than_3:@ blocks left >  3 - store block final-3, start the GHASH accumulators from it, build block final-2, then fall through
     802 	st1	{ q5}, [r2], #16                     @ AES final-3 block  - store result
     803 
     804 	ldp	r6, r7, [r0], #16           @ AES final-2 block - load input low & high
     805 #ifdef __ARMEB__
     806 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the low word just loaded
     807 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the high word just loaded
     808 #endif
     809 	rev64	q4, q5                                    @ GHASH final-3 block
     810 
     811 	eor	q4, q4, q8                           @ feed in partial tag
     812 	eor	r7, r7, r14                     @ AES final-2 block - round 10 high
     813 	eor	r6, r6, r13                     @ AES final-2 block - round 10 low
     814 
     815 	fmov	d5, r6                                 @ AES final-2 block - mov low
     816 
     817 	movi	q8, #0                                        @ suppress further partial tag feed in
     818 	fmov	v5.d[1], r7                             @ AES final-2 block - mov high
     819 
     820 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low (overwrites v11: the previous tag already went into q4 via q8)
     821 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
     822 
     823 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high (overwrites the high accumulator)
     824 
     825 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
     826 
     827 	eor	q5, q5, q1                            @ AES final-2 block - result
     828 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
     829 
     830 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid (overwrites the mid accumulator)
     831 .L128_enc_blocks_more_than_2:@ blocks left >  2 - store block final-2, accumulate it into GHASH, build block final-1, then fall through
     832 
     833 	st1	{ q5}, [r2], #16                     @ AES final-2 block - store result
     834 
     835 	rev64	q4, q5                                    @ GHASH final-2 block
     836 	ldp	r6, r7, [r0], #16           @ AES final-1 block - load input low & high
     837 #ifdef __ARMEB__
     838 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the low word just loaded
     839 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the high word just loaded
     840 #endif
     841 	eor	q4, q4, q8                           @ feed in partial tag (q8 is already zero if the section above consumed it)
     842 
     843 	eor	r6, r6, r13                     @ AES final-1 block - round 10 low
     844 
     845 	fmov	d5, r6                                 @ AES final-1 block - mov low
     846 	eor	r7, r7, r14                     @ AES final-1 block - round 10 high
     847 
     848 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
     849 	fmov	v5.d[1], r7                             @ AES final-1 block - mov high
     850 
     851 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
     852 
     853 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
     854 
     855 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
     856 
     857 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
     858 
     859 	eor	q5, q5, q2                            @ AES final-1 block - result
     860 
     861 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
     862 
     863 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
     864 
     865 	movi	q8, #0                                        @ suppress further partial tag feed in
     866 
     867 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
     868 .L128_enc_blocks_more_than_1:@ blocks left >  1 - store block final-1, accumulate it into GHASH, build the final block, then fall through
     869 
     870 	st1	{ q5}, [r2], #16                     @ AES final-1 block - store result
     871 
     872 	rev64	q4, q5                                    @ GHASH final-1 block
     873 	ldp	r6, r7, [r0], #16           @ AES final block - load input low & high; NOTE(review): reads a full 16 bytes although the final block may be partial - confirm the input buffer allows that
     874 #ifdef __ARMEB__
     875 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the low word just loaded
     876 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the high word just loaded
     877 #endif
     878 	eor	q4, q4, q8                           @ feed in partial tag (q8 is already zero if a section above consumed it)
     879 
     880 	eor	r7, r7, r14                     @ AES final block - round 10 high
     881 	eor	r6, r6, r13                     @ AES final block - round 10 low
     882 
     883 	fmov	d5, r6                                 @ AES final block - mov low
     884 
     885 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
     886 	fmov	v5.d[1], r7                             @ AES final block - mov high
     887 
     888 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
     889 
     890 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
     891 
     892 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
     893 
     894 	eor	q5, q5, q3                            @ AES final block - result
     895 
     896 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid (copy into the upper lane, which is what pmull2 below multiplies)
     897 
     898 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
     899 
     900 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
     901 
     902 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
     903 
     904 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
     905 	movi	q8, #0                                        @ suppress further partial tag feed in
     906 .L128_enc_blocks_less_than_1:@ blocks left <= 1 - mask the (possibly partial) last block, fold it into GHASH, reduce, store block/counter/tag and return
     907 
     908 	and	r1, r1, #127                    @ bit_length %= 128
     909 	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff (r13 is reused as a mask from here on)
     910 
     911 	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff (r14 is reused as a mask from here on)
     912 	sub	r1, r1, #128                    @ bit_length -= 128
     913 
     914 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
     915 
     916 	and	r1, r1, #127                    @ bit_length %= 128
     917 
     918 	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
     919 	cmp	r1, #64                                 @ is the shift count computed above below 64?
     920 
     921 	csel	r6, r13, r14, lt                        @ low mask word: all ones when r1 < 64, otherwise the shifted mask
     922 	csel	r7, r14, xzr, lt                        @ high mask word: the shifted mask when r1 < 64, otherwise zero
     923 
     924 	fmov	d0, r6                                 @ ctr0b is mask for last block
     925 
     926 	fmov	v0.d[1], r7                             @ upper lane of the mask
     927 
     928 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
     929 
     930 	rev64	q4, q5                                    @ GHASH final block
     931 
     932 	eor	q4, q4, q8                           @ feed in partial tag
     933 
     934 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
     935 
     936 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
     937 	ld1	{ v18.16b}, [r2]                            @ load existing bytes where the possibly partial last block is to be stored
     938 
     939 	eor	q8, q8, q4                          @ GHASH final block - mid
     940 #ifndef __ARMEB__
     941 	rev	r9, r12                                 @ without __ARMEB__: byte-reverse rev_ctr32 into r9 for the counter store below
     942 #else
     943 	mov	r9, r12                                 @ with __ARMEB__: copy rev_ctr32 into r9 unchanged
     944 #endif
     945 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
     946 
     947 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
     948 
     949 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
     950 
     951 	eor	q9, q9, v20.16b                            @ GHASH final block - high
     952 
     953 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
     954 	movi	q8, #0xc2                               @ byte value 0xc2; becomes mod_constant after the shl below
     955 
     956 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
     957 
     958 	shl	d8, d8, #56               @ mod_constant
     959 
     960 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
     961 
     962 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
     963 
     964 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
     965 
     966 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
     967 
     968 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
     969 
     970 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
     971 
     972 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
     973 
     974 	bif	q5, v18.16b, q0                              @ insert existing bytes in top end of result before storing
     975 
     976 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
     977 	st1	{ q5}, [r2]                          @ store all 16B
     978 
     979 	str	r9, [r16, #12]                          @ store the updated counter; NOTE(review): offset 12 is the last 4 bytes of the 16-byte block read from [r16] at entry - confirm the store is 32 bits wide
     980 
     981 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
     982 	ext	v11.16b, v11.16b, v11.16b, #8                     @ swap the two 64-bit halves of the tag (same ext/rev64 pair as applied after loading it at entry)
     983 	rev64	v11.16b, v11.16b                         @ byte-reverse each 64-bit half
     984 	mov	r0, r15                                 @ return value: byte_len saved in r15 at entry
     985 	st1	{ v11.16b }, [r3]                           @ write the updated tag back to where it was loaded from
     986 	ldp	r21, r22, [sp, #16]                     @ restore the registers saved by the prologue
     987 	ldp	r23, r24, [sp, #32]
     988 	ldp	d8, d9, [sp, #48]
     989 	ldp	d10, d11, [sp, #64]
     990 	ldp	d12, d13, [sp, #80]
     991 	ldp	d14, d15, [sp, #96]
     992 	ldp	r19, r20, [sp], #112                    @ restore r19/r20 and release the 112-byte frame
     993 	RET
    994 
     995 .L128_enc_ret:                                  @ reached from the cbz at function entry when the bit length in r1 is zero (no frame was pushed)
     996 	mov	r0, #0x0                                @ return 0
     997 	RET
     998 .size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
     999 .globl	aes_gcm_dec_128_kernel
    1000 .type	aes_gcm_dec_128_kernel,%function
    1001 .align	4
    1002 aes_gcm_dec_128_kernel:                         @ as used below: r0 = ciphertext in, r1 = length in bits, r2 = output, r3 = tag + hash-key table, r4 = counter block, r5 = round keys
    1003 	AARCH64_VALID_CALL_TARGET
    1004 	cbz	r1, .L128_dec_ret                       @ zero bit length: leave before any frame is pushed
    1005 	stp	r19, r20, [sp, #-112]!                  @ open a 112-byte frame; r19-r24 and d8-d15 are saved into it below
    1006 	mov	r16, r4                                 @ keep the counter block pointer in r16 (r4 becomes end_input_ptr below)
    1007 	mov	r8, r5                                  @ keep the round key pointer in r8 (r5 becomes byte_len below)
    1008 	stp	r21, r22, [sp, #16]
    1009 	stp	r23, r24, [sp, #32]
    1010 	stp	d8, d9, [sp, #48]
    1011 	stp	d10, d11, [sp, #64]
    1012 	stp	d12, d13, [sp, #80]
    1013 	stp	d14, d15, [sp, #96]
    1014 
    1015 	lsr	r5, r1, #3              @ byte_len
    1016 	mov	r15, r5                                 @ keep a copy of byte_len in r15
    1017 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
    1018 #ifdef __ARMEB__
    1019 	rev	r10, r10                                @ only under __ARMEB__: byte-reverse the counter words just loaded
    1020 	rev	r11, r11
    1021 #endif
    1022 	ldp	r13, r14, [r8, #160]                     @ load rk10
    1023 #ifdef __ARMEB__
    1024 	ror	r14, r14, 32                            @ only under __ARMEB__: rotate the rk10 words by 32 bits
    1025 	ror	r13, r13, 32
    1026 #endif
    1027 	sub	r5, r5, #1      @ byte_len - 1
    1028 	ld1	{v18.4s}, [r8], #16                                @ load rk0
    1029 
    1030 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    1031 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
    1032 
    1033 	ldr	q13, [r3, #64]                         @ load h2l | h2h
    1034 #ifndef __ARMEB__
    1035 	ext	v13.16b, v13.16b, v13.16b, #8
    1036 #endif
    1037 	lsr	r12, r11, #32                           @ r12 = upper 32 bits of the second counter word
    1038 	fmov	d2, r10                               @ CTR block 2
    1039 
    1040 	ld1	{v19.4s}, [r8], #16                                @ load rk1
    1041 	orr	r11, r11, r11                           @ NOTE(review): OR of a register with itself changes nothing at full width; looks intended for a 32-bit view to clear the upper half - confirm
    1042 	rev	r12, r12                                @ rev_ctr32
    1043 
    1044 	fmov	d1, r10                               @ CTR block 1
    1045 	add	r12, r12, #1                            @ increment rev_ctr32
    1046 
    1047 	aese	q0, v18.16b
    1048 	aesmc	q0, q0          @ AES block 0 - round 0
    1049 	rev	r9, r12                                 @ CTR block 1
    1050 
    1051 	orr	r9, r11, r9, lsl #32            @ CTR block 1
    1052 	ld1	{v20.4s}, [r8], #16                                @ load rk2
    1053 	add	r12, r12, #1                            @ CTR block 1
    1054 
    1055 	fmov	v1.d[1], r9                               @ CTR block 1
    1056 	rev	r9, r12                                 @ CTR block 2
    1057 	add	r12, r12, #1                            @ CTR block 2
    1058 
    1059 	aese	q0, v19.16b
    1060 	aesmc	q0, q0          @ AES block 0 - round 1
    1061 	orr	r9, r11, r9, lsl #32            @ CTR block 2
    1062 
    1063 	fmov	v2.d[1], r9                               @ CTR block 2
    1064 	rev	r9, r12                                 @ CTR block 3
    1065 
    1066 	fmov	d3, r10                               @ CTR block 3
    1067 	orr	r9, r11, r9, lsl #32            @ CTR block 3
    1068 	add	r12, r12, #1                            @ CTR block 3
    1069 
    1070 	fmov	v3.d[1], r9                               @ CTR block 3
    1071 	add	r4, r0, r1, lsr #3   @ end_input_ptr
    1072 
    1073 	aese	q1, v18.16b
    1074 	aesmc	q1, q1          @ AES block 1 - round 0
    1075 	ld1	{v21.4s}, [r8], #16                                @ load rk3
    1076 
    1077 	aese	q0, v20.16b
    1078 	aesmc	q0, q0          @ AES block 0 - round 2
    1079 	ld1	{v22.4s}, [r8], #16                                @ load rk4
    1080 
    1081 	aese	q2, v18.16b
    1082 	aesmc	q2, q2          @ AES block 2 - round 0
    1083 	ld1	{v23.4s}, [r8], #16                                @ load rk5
    1084 
    1085 	aese	q1, v19.16b
    1086 	aesmc	q1, q1          @ AES block 1 - round 1
    1087 	ld1	{v24.4s}, [r8], #16                                @ load rk6
    1088 
    1089 	aese	q3, v18.16b
    1090 	aesmc	q3, q3          @ AES block 3 - round 0
    1091 
    1092 	aese	q2, v19.16b
    1093 	aesmc	q2, q2          @ AES block 2 - round 1
    1094 
    1095 	aese	q1, v20.16b
    1096 	aesmc	q1, q1          @ AES block 1 - round 2
    1097 
    1098 	aese	q3, v19.16b
    1099 	aesmc	q3, q3          @ AES block 3 - round 1
    1100 	ld1	{ v11.16b}, [r3]                            @ load the current tag from the start of the r3 table
    1101 	ext	v11.16b, v11.16b, v11.16b, #8                     @ swap its two 64-bit halves
    1102 	rev64	v11.16b, v11.16b                         @ byte-reverse each 64-bit half
    1103 
    1104 	aese	q0, v21.16b
    1105 	aesmc	q0, q0          @ AES block 0 - round 3
    1106 	ld1	{v25.4s}, [r8], #16                                @ load rk7
    1107 
    1108 	aese	q1, v21.16b
    1109 	aesmc	q1, q1          @ AES block 1 - round 3
    1110 
    1111 	aese	q3, v20.16b
    1112 	aesmc	q3, q3          @ AES block 3 - round 2
    1113 
    1114 	aese	q2, v20.16b
    1115 	aesmc	q2, q2          @ AES block 2 - round 2
    1116 	ld1	{v26.4s}, [r8], #16                                @ load rk8
    1117 
    1118 	aese	q1, v22.16b
    1119 	aesmc	q1, q1          @ AES block 1 - round 4
    1120 
    1121 	aese	q3, v21.16b
    1122 	aesmc	q3, q3          @ AES block 3 - round 3
    1123 
    1124 	aese	q2, v21.16b
    1125 	aesmc	q2, q2          @ AES block 2 - round 3
    1126 	ldr	q14, [r3, #80]                         @ load h3l | h3h
    1127 #ifndef __ARMEB__
    1128 	ext	v14.16b, v14.16b, v14.16b, #8
    1129 #endif
    1130 	aese	q0, v22.16b
    1131 	aesmc	q0, q0          @ AES block 0 - round 4
    1132 	ld1	{v27.4s}, [r8], #16                                @ load rk9
    1133 
    1134 	aese	q1, v23.16b
    1135 	aesmc	q1, q1          @ AES block 1 - round 5
    1136 
    1137 	aese	q2, v22.16b
    1138 	aesmc	q2, q2          @ AES block 2 - round 4
    1139 
    1140 	aese	q3, v22.16b
    1141 	aesmc	q3, q3          @ AES block 3 - round 4
    1142 
    1143 	aese	q0, v23.16b
    1144 	aesmc	q0, q0          @ AES block 0 - round 5
    1145 
    1146 	aese	q2, v23.16b
    1147 	aesmc	q2, q2          @ AES block 2 - round 5
    1148 	ldr	q12, [r3, #32]                         @ load h1l | h1h
    1149 #ifndef __ARMEB__
    1150 	ext	v12.16b, v12.16b, v12.16b, #8
    1151 #endif
    1152 	aese	q3, v23.16b
    1153 	aesmc	q3, q3          @ AES block 3 - round 5
    1154 
    1155 	aese	q0, v24.16b
    1156 	aesmc	q0, q0          @ AES block 0 - round 6
    1157 
    1158 	aese	q1, v24.16b
    1159 	aesmc	q1, q1          @ AES block 1 - round 6
    1160 
    1161 	aese	q3, v24.16b
    1162 	aesmc	q3, q3          @ AES block 3 - round 6
    1163 
    1164 	aese	q2, v24.16b
    1165 	aesmc	q2, q2          @ AES block 2 - round 6
    1166 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
    1167 
    1168 	ldr	q15, [r3, #112]                        @ load h4l | h4h
    1169 #ifndef __ARMEB__
    1170 	ext	v15.16b, v15.16b, v15.16b, #8
    1171 #endif
    1172 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
    1173 	add	r5, r5, r0                              @ r5 = input pointer + main-loop byte count, i.e. where the main loop must stop
    1174 
    1175 	aese	q1, v25.16b
    1176 	aesmc	q1, q1          @ AES block 1 - round 7
    1177 
    1178 	aese	q2, v25.16b
    1179 	aesmc	q2, q2          @ AES block 2 - round 7
    1180 
    1181 	aese	q0, v25.16b
    1182 	aesmc	q0, q0          @ AES block 0 - round 7
    1183 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
    1184 
    1185 	aese	q3, v25.16b
    1186 	aesmc	q3, q3          @ AES block 3 - round 7
    1187 
    1188 	aese	q1, v26.16b
    1189 	aesmc	q1, q1          @ AES block 1 - round 8
    1190 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
    1191 
    1192 	aese	q2, v26.16b
    1193 	aesmc	q2, q2          @ AES block 2 - round 8
    1194 
    1195 	aese	q3, v26.16b
    1196 	aesmc	q3, q3          @ AES block 3 - round 8
    1197 
    1198 	aese	q0, v26.16b
    1199 	aesmc	q0, q0          @ AES block 0 - round 8
    1200 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
    1201 
    1202 	aese	q2, v27.16b                                      @ AES block 2 - round 9
    1203 
    1204 	aese	q3, v27.16b                                      @ AES block 3 - round 9
    1205 
    1206 	aese	q0, v27.16b                                      @ AES block 0 - round 9
    1207 	cmp	r0, r5                   @ check if we have <= 4 blocks
    1208 
    1209 	aese	q1, v27.16b                                      @ AES block 1 - round 9
    1210 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
    1211 	bge	.L128_dec_tail                                    @ handle tail
    1212 
    1213 	ld1	{q4, q5}, [r0], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
    1214 
    1215 	eor	q1, q5, q1                            @ AES block 1 - result
    1216 	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
    1217 
    1218 	eor	q0, q4, q0                            @ AES block 0 - result
    1219 	rev64	q4, q4                                    @ GHASH block 0
    1220 	rev	r9, r12                                 @ CTR block 4
    1221 
    1222 	orr	r9, r11, r9, lsl #32            @ CTR block 4
    1223 	add	r12, r12, #1                            @ CTR block 4
    1224 	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
    1225 
    1226 	rev64	q5, q5                                    @ GHASH block 1
    1227 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
    1228 
    1229 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
    1230 
    1231 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
    1232 	cmp	r0, r5                   @ check if we have <= 8 blocks
    1233 
    1234 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
    1235 
    1236 	fmov	d0, r10                               @ CTR block 4
    1237 
    1238 	fmov	v0.d[1], r9                               @ CTR block 4
    1239 	rev	r9, r12                                 @ CTR block 5
    1240 	eor	r19, r19, r13                   @ AES block 1 - round 10 low
    1241 #ifdef __ARMEB__
    1242 	rev	r19, r19                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1243 #endif
    1244 	fmov	d1, r10                               @ CTR block 5
    1245 	add	r12, r12, #1                            @ CTR block 5
    1246 	orr	r9, r11, r9, lsl #32            @ CTR block 5
    1247 
    1248 	fmov	v1.d[1], r9                               @ CTR block 5
    1249 	rev	r9, r12                                 @ CTR block 6
    1250 	add	r12, r12, #1                            @ CTR block 6
    1251 
    1252 	orr	r9, r11, r9, lsl #32            @ CTR block 6
    1253 
    1254 	eor	r20, r20, r14                   @ AES block 1 - round 10 high
    1255 #ifdef __ARMEB__
    1256 	rev	r20, r20                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1257 #endif
    1258 	eor	r6, r6, r13                   @ AES block 0 - round 10 low
    1259 #ifdef __ARMEB__
    1260 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the result word before it is stored
    1261 #endif
    1262 	eor	q2, q6, q2                            @ AES block 2 - result
    1263 
    1264 	eor	r7, r7, r14                   @ AES block 0 - round 10 high
    1265 #ifdef __ARMEB__
    1266 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the result word before it is stored
    1267 #endif
    1268 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
    1269 
    1270 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
    1271 	bge	.L128_dec_prepretail                              @ do prepretail
   1272 
    1273 .L128_dec_main_loop:@ main loop start - each pass stores four 16-byte results, loads four more ciphertext blocks and folds four blocks into GHASH; repeats while r0 < r5
    1274 	eor	q3, q7, q3                            @ AES block 4k+3 - result
    1275 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
    1276 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
    1277 
    1278 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
    1279 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
    1280 
    1281 	aese	q1, v18.16b
    1282 	aesmc	q1, q1          @ AES block 4k+5 - round 0
    1283 	fmov	d2, r10                               @ CTR block 4k+6
    1284 
    1285 	rev64	q6, q6                                    @ GHASH block 4k+2
    1286 	fmov	v2.d[1], r9                               @ CTR block 4k+6
    1287 	rev	r9, r12                                 @ CTR block 4k+7
    1288 
    1289 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
    1290 	eor	q4, q4, v11.16b                           @ PRE 1
    1291 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
    1292 
    1293 	aese	q1, v19.16b
    1294 	aesmc	q1, q1          @ AES block 4k+5 - round 1
    1295 	rev64	q7, q7                                    @ GHASH block 4k+3
    1296 
    1297 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
    1298 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
    1299 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
    1300 
    1301 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
    1302 	fmov	d3, r10                               @ CTR block 4k+7
    1303 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
    1304 
    1305 	aese	q1, v20.16b
    1306 	aesmc	q1, q1          @ AES block 4k+5 - round 2
    1307 	fmov	v3.d[1], r9                               @ CTR block 4k+7
    1308 
    1309 	aese	q2, v18.16b
    1310 	aesmc	q2, q2          @ AES block 4k+6 - round 0
    1311 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
    1312 
    1313 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
    1314 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
    1315 
    1316 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
    1317 
    1318 	aese	q1, v21.16b
    1319 	aesmc	q1, q1          @ AES block 4k+5 - round 3
    1320 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
    1321 
    1322 	aese	q3, v18.16b
    1323 	aesmc	q3, q3          @ AES block 4k+7 - round 0
    1324 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
    1325 
    1326 	aese	q0, v18.16b
    1327 	aesmc	q0, q0          @ AES block 4k+4 - round 0
    1328 
    1329 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
    1330 	eor	q8, q8, q4                          @ GHASH block 4k - mid
    1331 
    1332 	aese	q3, v19.16b
    1333 	aesmc	q3, q3          @ AES block 4k+7 - round 1
    1334 	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
    1335 #ifdef __ARMEB__
    1336 	rev	r23, r23                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1337 #endif
    1338 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
    1339 	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
    1340 #ifdef __ARMEB__
    1341 	rev	r22, r22                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1342 #endif
    1343 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
    1344 
    1345 	aese	q0, v19.16b
    1346 	aesmc	q0, q0          @ AES block 4k+4 - round 1
    1347 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
    1348 
    1349 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
    1350 
    1351 	aese	q3, v20.16b
    1352 	aesmc	q3, q3          @ AES block 4k+7 - round 2
    1353 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
    1354 
    1355 	aese	q0, v20.16b
    1356 	aesmc	q0, q0          @ AES block 4k+4 - round 2
    1357 
    1358 	aese	q1, v22.16b
    1359 	aesmc	q1, q1          @ AES block 4k+5 - round 4
    1360 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
    1361 
    1362 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
    1363 
    1364 	aese	q0, v21.16b
    1365 	aesmc	q0, q0          @ AES block 4k+4 - round 3
    1366 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
    1367 
    1368 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
    1369 
    1370 	aese	q2, v19.16b
    1371 	aesmc	q2, q2          @ AES block 4k+6 - round 1
    1372 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
    1373 
    1374 	aese	q0, v22.16b
    1375 	aesmc	q0, q0          @ AES block 4k+4 - round 4
    1376 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
    1377 
    1378 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
    1379 	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
    1380 #ifdef __ARMEB__
    1381 	rev	r24, r24                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1382 #endif
    1383 	aese	q2, v20.16b
    1384 	aesmc	q2, q2          @ AES block 4k+6 - round 2
    1385 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
    1386 
    1387 	aese	q1, v23.16b
    1388 	aesmc	q1, q1          @ AES block 4k+5 - round 5
    1389 	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
    1390 #ifdef __ARMEB__
    1391 	rev	r21, r21                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1392 #endif
    1393 	aese	q0, v23.16b
    1394 	aesmc	q0, q0          @ AES block 4k+4 - round 5
    1395 	movi	q8, #0xc2                               @ byte value 0xc2; becomes mod_constant after the shl further down
    1396 
    1397 	aese	q2, v21.16b
    1398 	aesmc	q2, q2          @ AES block 4k+6 - round 3
    1399 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
    1400 
    1401 	aese	q1, v24.16b
    1402 	aesmc	q1, q1          @ AES block 4k+5 - round 6
    1403 
    1404 	aese	q0, v24.16b
    1405 	aesmc	q0, q0          @ AES block 4k+4 - round 6
    1406 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
    1407 
    1408 	aese	q2, v22.16b
    1409 	aesmc	q2, q2          @ AES block 4k+6 - round 4
    1410 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
    1411 
    1412 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
    1413 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
    1414 	ld1	{q4}, [r0], #16                       @ AES block 4k+3 - load ciphertext
    1415 
    1416 	aese	q1, v25.16b
    1417 	aesmc	q1, q1          @ AES block 4k+5 - round 7
    1418 	add	r12, r12, #1                            @ CTR block 4k+7
    1419 
    1420 	aese	q0, v25.16b
    1421 	aesmc	q0, q0          @ AES block 4k+4 - round 7
    1422 	shl	d8, d8, #56               @ mod_constant
    1423 
    1424 	aese	q2, v23.16b
    1425 	aesmc	q2, q2          @ AES block 4k+6 - round 5
    1426 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
    1427 
    1428 	aese	q1, v26.16b
    1429 	aesmc	q1, q1          @ AES block 4k+5 - round 8
    1430 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
    1431 
    1432 	aese	q0, v26.16b
    1433 	aesmc	q0, q0          @ AES block 4k+4 - round 8
    1434 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
    1435 
    1436 	aese	q3, v21.16b
    1437 	aesmc	q3, q3          @ AES block 4k+7 - round 3
    1438 	rev	r9, r12                                 @ CTR block 4k+8
    1439 
    1440 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
    1441 	ld1	{q5}, [r0], #16                       @ AES block 4k+4 - load ciphertext
    1442 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
    1443 
    1444 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
    1445 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
    1446 
    1447 	aese	q3, v22.16b
    1448 	aesmc	q3, q3          @ AES block 4k+7 - round 4
    1449 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
    1450 
    1451 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
    1452 
    1453 	aese	q2, v24.16b
    1454 	aesmc	q2, q2          @ AES block 4k+6 - round 6
    1455 	eor	q0, q4, q0                            @ AES block 4k+4 - result
    1456 
    1457 	aese	q3, v23.16b
    1458 	aesmc	q3, q3          @ AES block 4k+7 - round 5
    1459 	ld1	{q6}, [r0], #16                       @ AES block 4k+5 - load ciphertext
    1460 
    1461 	add	r12, r12, #1                            @ CTR block 4k+8
    1462 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
    1463 	eor	q1, q5, q1                            @ AES block 4k+5 - result
    1464 
    1465 	aese	q2, v25.16b
    1466 	aesmc	q2, q2          @ AES block 4k+6 - round 7
    1467 	ld1	{q7}, [r0], #16                       @ AES block 4k+6 - load ciphertext
    1468 
    1469 	aese	q3, v24.16b
    1470 	aesmc	q3, q3          @ AES block 4k+7 - round 6
    1471 
    1472 	rev64	q5, q5                                    @ GHASH block 4k+5
    1473 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
    1474 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
    1475 
    1476 	aese	q2, v26.16b
    1477 	aesmc	q2, q2          @ AES block 4k+6 - round 8
    1478 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
    1479 
    1480 	aese	q3, v25.16b
    1481 	aesmc	q3, q3          @ AES block 4k+7 - round 7
    1482 	fmov	d0, r10                               @ CTR block 4k+8
    1483 
    1484 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
    1485 	fmov	v0.d[1], r9                               @ CTR block 4k+8
    1486 	rev	r9, r12                                 @ CTR block 4k+9
    1487 
    1488 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
    1489 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
    1490 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
    1491 
    1492 	aese	q3, v26.16b
    1493 	aesmc	q3, q3          @ AES block 4k+7 - round 8
    1494 	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
    1495 #ifdef __ARMEB__
    1496 	rev	r7, r7                                  @ only under __ARMEB__: byte-reverse the result word before it is stored
    1497 #endif
    1498 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
    1499 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
    1500 	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
    1501 #ifdef __ARMEB__
    1502 	rev	r6, r6                                  @ only under __ARMEB__: byte-reverse the result word before it is stored
    1503 #endif
    1504 	eor	q2, q6, q2                            @ AES block 4k+6 - result
    1505 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
    1506 	add	r12, r12, #1                            @ CTR block 4k+9
    1507 
    1508 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
    1509 	fmov	d1, r10                               @ CTR block 4k+9
    1510 	cmp	r0, r5                   @ .LOOP CONTROL
    1511 
    1512 	rev64	q4, q4                                    @ GHASH block 4k+4
    1513 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
    1514 	fmov	v1.d[1], r9                               @ CTR block 4k+9
    1515 
    1516 	rev	r9, r12                                 @ CTR block 4k+10
    1517 	add	r12, r12, #1                            @ CTR block 4k+10
    1518 
    1519 	eor	r20, r20, r14                   @ AES block 4k+5 - round 10 high
    1520 #ifdef __ARMEB__
    1521 	rev	r20, r20                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1522 #endif
    1523 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
    1524 
    1525 	eor	r19, r19, r13                   @ AES block 4k+5 - round 10 low
    1526 #ifdef __ARMEB__
    1527 	rev	r19, r19                                @ only under __ARMEB__: byte-reverse the result word before it is stored
    1528 #endif
    1529 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
    1530 
    1531 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
    1532 	blt	.L128_dec_main_loop                     @ loop again while r0 < r5 (flags from the LOOP CONTROL cmp above; nothing in between sets flags)
   1533 
   1534 .L128_dec_prepretail:@ PREPRETAIL
   1535 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   1536 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   1537 	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
   1538 
   1539 	aese	q0, v18.16b
   1540 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   1541 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   1542 
   1543 	aese	q1, v18.16b
   1544 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   1545 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   1546 
   1547 	eor	q4, q4, v11.16b                           @ PRE 1
   1548 	fmov	d2, r10                               @ CTR block 4k+6
   1549 	rev64	q6, q6                                    @ GHASH block 4k+2
   1550 
   1551 	aese	q0, v19.16b
   1552 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   1553 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   1554 
   1555 	rev	r9, r12                                 @ CTR block 4k+7
   1556 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   1557 	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
   1558 
   1559 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   1560 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   1561 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   1562 
   1563 	aese	q1, v19.16b
   1564 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   1565 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   1566 
   1567 	aese	q0, v20.16b
   1568 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   1569 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   1570 
   1571 	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   1572 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   1573 	fmov	d3, r10                               @ CTR block 4k+7
   1574 
   1575 	aese	q2, v18.16b
   1576 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   1577 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   1578 
   1579 	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
   1580 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   1581 
   1582 	rev64	q7, q7                                    @ GHASH block 4k+3
   1583 
   1584 	aese	q2, v19.16b
   1585 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   1586 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   1587 
   1588 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   1589 
   1590 	aese	q3, v18.16b
   1591 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   1592 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   1593 
   1594 	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   1595 
   1596 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   1597 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
   1598 
   1599 	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   1600 
   1601 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   1602 	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
   1603 
   1604 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
   1605 
   1606 	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   1607 
   1608 	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   1609 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   1610 
   1611 	aese	q1, v20.16b
   1612 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   1613 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   1614 
   1615 	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   1616 
   1617 	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
   1618 	movi	q8, #0xc2
   1619 
   1620 	aese	q3, v19.16b
   1621 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   1622 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   1623 
   1624 	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
   1625 
   1626 	aese	q2, v20.16b
   1627 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   1628 	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
   1629 
   1630 	aese	q3, v20.16b
   1631 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   1632 	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
   1633 #ifdef __ARMEB__
   1634 	rev	r23, r23
   1635 #endif
   1636 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   1637 	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
   1638 #ifdef __ARMEB__
   1639 	rev	r21, r21
   1640 #endif
   1641 	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
   1642 
   1643 	aese	q2, v21.16b
   1644 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   1645 
   1646 	aese	q1, v21.16b
   1647 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   1648 	shl	d8, d8, #56               @ mod_constant
   1649 
   1650 	aese	q0, v21.16b
   1651 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   1652 
   1653 	aese	q2, v22.16b
   1654 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   1655 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   1656 
   1657 	aese	q1, v22.16b
   1658 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   1659 
   1660 	aese	q3, v21.16b
   1661 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   1662 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   1663 
   1664 	aese	q2, v23.16b
   1665 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   1666 
   1667 	aese	q1, v23.16b
   1668 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   1669 
   1670 	aese	q3, v22.16b
   1671 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   1672 
   1673 	aese	q0, v22.16b
   1674 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   1675 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   1676 
   1677 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   1678 
   1679 	aese	q1, v24.16b
   1680 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   1681 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   1682 
   1683 	aese	q3, v23.16b
   1684 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   1685 
   1686 	aese	q0, v23.16b
   1687 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   1688 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   1689 
   1690 	aese	q1, v25.16b
   1691 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   1692 
   1693 	aese	q2, v24.16b
   1694 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   1695 
   1696 	aese	q0, v24.16b
   1697 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   1698 
   1699 	aese	q1, v26.16b
   1700 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   1701 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   1702 
   1703 	aese	q3, v24.16b
   1704 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   1705 
   1706 	aese	q0, v25.16b
   1707 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   1708 
   1709 	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
   1710 
   1711 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   1712 	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
   1713 #ifdef __ARMEB__
   1714 	rev	r24, r24
   1715 #endif
   1716 	aese	q2, v25.16b
   1717 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   1718 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   1719 
   1720 	aese	q3, v25.16b
   1721 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   1722 
   1723 	aese	q0, v26.16b
   1724 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   1725 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   1726 
   1727 	aese	q2, v26.16b
   1728 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   1729 
   1730 	aese	q3, v26.16b
   1731 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   1732 	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
   1733 #ifdef __ARMEB__
   1734 	rev	r22, r22
   1735 #endif
   1736 	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
   1737 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   1738 
   1739 	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
   1740 	add	r12, r12, #1                            @ CTR block 4k+7
   1741 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   1742 
   1743 	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
   1744 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   1745 .L128_dec_tail:@ TAIL - finish the remaining blocks; dispatch on the number of bytes left
   1746 
   1747 	sub	r5, r4, r0   @ r5 (was main_end_input_ptr) = r4 - r0 = number of bytes left to process, counted before the load below advances r0
   1748 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   1749 
   1750 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   1751 
   1752 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   1753 
   1754 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   1755 
   1756 	cmp	r5, #48                                 @ more than 3 blocks (48 bytes) left? flags are consumed by the bgt below
   1757 
   1758 	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
   1759 #ifdef __ARMEB__
   1760 	rev	r7, r7
   1761 #endif
   1762 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   1763 	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
   1764 #ifdef __ARMEB__
   1765 	rev	r6, r6
   1766 #endif
   1767 	bgt	.L128_dec_blocks_more_than_3            @ > 48 bytes left
   1768 	@ at most 48 bytes left: re-slot the keystream blocks and zero the GHASH accumulators
   1769 	mov	q3, q2                                  @ q3 = old q2: the more_than_2 path XORs the final block with q3
   1770 	sub	r12, r12, #1                            @ step the counter in r12 back by one (one block fewer than the more_than_3 path)
   1771 	movi	v11.8b, #0                              @ clear GHASH low accumulator (more_than_3 overwrites it with pmull; later paths only eor into it)
   1772 
   1773 	movi	q9, #0                                  @ clear GHASH high accumulator (same reason)
   1774 	mov	q2, q1                                  @ q2 = old q1: the more_than_2 path XORs the final-1 block with q2
   1775 
   1776 	movi	v10.8b, #0                              @ clear GHASH mid accumulator (same reason)
   1777 	cmp	r5, #32                                 @ more than 2 blocks (32 bytes) left?
   1778 	bgt	.L128_dec_blocks_more_than_2
   1779 
   1780 	cmp	r5, #16                                 @ more than 1 block (16 bytes) left? mov/sub below leave the flags intact
   1781 
   1782 	mov	q3, q1                                  @ q3 = q1: the more_than_1 path XORs the final block with q3
   1783 	sub	r12, r12, #1                            @ another block fewer - step the counter back again
   1784 	bgt	.L128_dec_blocks_more_than_1
   1785 
   1786 	sub	r12, r12, #1                            @ at most 16 bytes left - step the counter back once more
   1787 	b	.L128_dec_blocks_less_than_1
   1788 .L128_dec_blocks_more_than_3:@ blocks left >  3
   1789 	rev64	q4, q5                                    @ GHASH final-3 block
   1790 	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext
   1791 
   1792 	eor	q4, q4, q8                           @ feed in partial tag
   1793 
   1794 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
   1795 	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result
   1796 	eor	q0, q5, q1                            @ AES final-2 block - result
   1797 
   1798 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
   1799 	mov	r7, v0.d[1]                            @ AES final-2 block - mov high
   1800 
   1801 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
   1802 	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
   1803 
   1804 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
   1805 
   1806 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
   1807 
   1808 	movi	q8, #0                                        @ suppress further partial tag feed in
   1809 	eor	r7, r7, r14                   @ AES final-2 block - round 10 high
   1810 #ifdef __ARMEB__
   1811 	rev	r7, r7
   1812 #endif
   1813 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
   1814 	eor	r6, r6, r13                   @ AES final-2 block - round 10 low
   1815 #ifdef __ARMEB__
   1816 	rev	r6, r6
   1817 #endif
   1818 .L128_dec_blocks_more_than_2:@ blocks left >  2
   1819 
   1820 	rev64	q4, q5                                    @ GHASH final-2 block
   1821 	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext
   1822 
   1823 	eor	q4, q4, q8                           @ feed in partial tag
   1824 
   1825 	eor	q0, q5, q2                            @ AES final-1 block - result
   1826 	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result
   1827 
   1828 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
   1829 
   1830 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
   1831 
   1832 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
   1833 	mov	r6, v0.d[0]                            @ AES final-1 block - mov low
   1834 
   1835 	mov	r7, v0.d[1]                            @ AES final-1 block - mov high
   1836 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
   1837 
   1838 	movi	q8, #0                                        @ suppress further partial tag feed in
   1839 
   1840 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
   1841 
   1842 	eor	r6, r6, r13                   @ AES final-1 block - round 10 low
   1843 #ifdef __ARMEB__
   1844 	rev	r6, r6
   1845 #endif
   1846 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
   1847 
   1848 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
   1849 
   1850 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
   1851 	eor	r7, r7, r14                   @ AES final-1 block - round 10 high
   1852 #ifdef __ARMEB__
   1853 	rev	r7, r7
   1854 #endif
   1855 .L128_dec_blocks_more_than_1:@ blocks left >  1
   1856 
   1857 	rev64	q4, q5                                    @ GHASH final-1 block
   1858 
   1859 	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext
   1860 	eor	q4, q4, q8                           @ feed in partial tag
   1861 
   1862 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
   1863 
   1864 	eor	q0, q5, q3                            @ AES final block - result
   1865 
   1866 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
   1867 
   1868 	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result
   1869 	mov	r6, v0.d[0]                            @ AES final block - mov low
   1870 
   1871 	mov	r7, v0.d[1]                            @ AES final block - mov high
   1872 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
   1873 
   1874 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
   1875 
   1876 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
   1877 
   1878 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
   1879 	movi	q8, #0                                        @ suppress further partial tag feed in
   1880 
   1881 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
   1882 
   1883 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
   1884 	eor	r7, r7, r14                   @ AES final block - round 10 high
   1885 #ifdef __ARMEB__
   1886 	rev	r7, r7
   1887 #endif
   1888 	eor	r6, r6, r13                   @ AES final block - round 10 low
   1889 #ifdef __ARMEB__
   1890 	rev	r6, r6
   1891 #endif
   1892 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
   1893 .L128_dec_blocks_less_than_1:@ blocks left <= 1
   1894 	@ Build a 128-bit mask (r9 = low half, r10 = high half) for the final, possibly partial block
   1895 	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff
   1896 	and	r1, r1, #127                    @ bit_length %= 128
   1897 
   1898 	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff
   1899 	sub	r1, r1, #128                    @ bit_length -= 128
   1900 
   1901 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
   1902 
   1903 	and	r1, r1, #127                    @ bit_length %= 128
   1904 
   1905 	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
   1906 	cmp	r1, #64                                 @ fewer than 64 bits to clear?
   1907 
   1908 	csel	r10, r14, xzr, lt                       @ high-half mask: the shifted mask if r1 < 64, otherwise 0
   1909 	csel	r9, r13, r14, lt                        @ low-half mask: all ones if r1 < 64, otherwise the shifted mask; NOTE(review): the r1 >= 64 case presumably relies on the shift amount being taken modulo 64 - confirm
   1910 
   1911 	fmov	d0, r9                                   @ ctr0b is mask for last block
   1912 
   1913 	mov	v0.d[1], r10                            @ upper 64 bits of the mask vector = high-half mask
   1914 
   1915 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   1916 
   1917 	rev64	q4, q5                                    @ GHASH final block
   1918 
   1919 	eor	q4, q4, q8                           @ feed in partial tag
   1920 
   1921 	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
   1922 
   1923 	and	r7, r7, r10                             @ keep only the masked-in bits of the high result half
   1924 
   1925 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   1926 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   1927 
   1928 	eor	q8, q8, q4                          @ GHASH final block - mid
   1929 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   1930 
   1931 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   1932 
   1933 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   1934 	bic	r4, r4, r9           @ mask out low existing bytes
   1935 	and	r6, r6, r9                              @ keep only the masked-in bits of the low result half
   1936 
   1937 #ifndef __ARMEB__
   1938 	rev	r9, r12                                 @ low mask no longer needed: r9 = byte-reversed counter from r12, stored below
   1939 #else
   1940 	mov	r9, r12                                 @ big-endian build: counter copied without byte reversal
   1941 #endif
   1942 
   1943 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
   1944 	movi	q8, #0xc2                               @ seed for mod_constant (shifted into place by the shl below)
   1945 
   1946 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   1947 
   1948 	bic	r5, r5, r10   @ mask out high existing bytes
   1949 	shl	d8, d8, #56               @ mod_constant
   1950 
   1951 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   1952 
   1953 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   1954 
   1955 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   1956 
   1957 	orr	r6, r6, r4                              @ merge new low bits with the preserved existing bits
   1958 	str	r9, [r16, #12]                          @ store the updated counter
   1959 
   1960 	orr	r7, r7, r5                              @ merge new high bits with the preserved existing bits
   1961 	stp	r6, r7, [r2]                            @ write all 16 bytes; positions outside the mask get their previously loaded values back
   1962 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   1963 
   1964 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   1965 
   1966 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   1967 
   1968 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   1969 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   1970 
   1971 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   1972 
   1973 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   1974 	ext	v11.16b, v11.16b, v11.16b, #8                     @ swap the two 64-bit halves of the reduced tag ...
   1975 	rev64	v11.16b, v11.16b                                  @ ... and byte-reverse each half (same ext/rev64 pair the enc kernels apply when loading the tag from [r3])
   1976 	mov	r0, r15                                 @ return value; NOTE(review): r15 presumably holds byte_len saved at entry, as in the enc kernels - confirm
   1977 	st1	{ v11.16b }, [r3]                       @ write the updated tag back to [r3]
   1978 	@ Epilogue: reload r19-r24 and d8-d15 from the stack frame, then pop its 112 bytes
   1979 	ldp	r21, r22, [sp, #16]
   1980 	ldp	r23, r24, [sp, #32]
   1981 	ldp	d8, d9, [sp, #48]
   1982 	ldp	d10, d11, [sp, #64]
   1983 	ldp	d12, d13, [sp, #80]
   1984 	ldp	d14, d15, [sp, #96]
   1985 	ldp	r19, r20, [sp], #112                    @ post-indexed load also releases the 112-byte frame
   1986 	RET
   1987 
   1988 .L128_dec_ret:@ early-return stub; NOTE(review): presumably reached from a cbz on a zero bit length at function entry, before the frame is pushed (as in the enc kernels) - confirm
   1989 	mov	r0, #0x0                                @ return 0
   1990 	RET
   1991 .size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
   1992 .globl	aes_gcm_enc_192_kernel
   1993 .type	aes_gcm_enc_192_kernel,%function
   1994 .align	4
   1995 aes_gcm_enc_192_kernel:
   1996 	AARCH64_VALID_CALL_TARGET
   1997 	cbz	r1, .L192_enc_ret
   1998 	stp	r19, r20, [sp, #-112]!
   1999 	mov	r16, r4
   2000 	mov	r8, r5
   2001 	stp	r21, r22, [sp, #16]
   2002 	stp	r23, r24, [sp, #32]
   2003 	stp	d8, d9, [sp, #48]
   2004 	stp	d10, d11, [sp, #64]
   2005 	stp	d12, d13, [sp, #80]
   2006 	stp	d14, d15, [sp, #96]
   2007 
   2008 	ldp	r10, r11, [r16]             @ ctr96_b64, ctr96_t32
   2009 #ifdef __ARMEB__
   2010 	rev	r10, r10
   2011 	rev	r11, r11
   2012 #endif
   2013 	ldp	r13, r14, [r8, #192]                     @ load rk12
   2014 #ifdef __ARMEB__
   2015 	ror	r13, r13, #32
   2016 	ror	r14, r14, #32
   2017 #endif
   2018 	ld1	{v18.4s}, [r8], #16	                             @ load rk0
   2019 
   2020 	ld1	{v19.4s}, [r8], #16	                             @ load rk1
   2021 
   2022 	ld1	{v20.4s}, [r8], #16	                             @ load rk2
   2023 
   2024 	lsr	r12, r11, #32
   2025 	ld1	{v21.4s}, [r8], #16	                             @ load rk3
   2026 	orr	r11, r11, r11
   2027 
   2028 	ld1	{v22.4s}, [r8], #16	                             @ load rk4
   2029 	rev	r12, r12                               @ rev_ctr32
   2030 
   2031 	add	r12, r12, #1                           @ increment rev_ctr32
   2032 	fmov	d3, r10                              @ CTR block 3
   2033 
   2034 	rev	r9, r12                                @ CTR block 1
   2035 	add	r12, r12, #1                           @ CTR block 1
   2036 	fmov	d1, r10                              @ CTR block 1
   2037 
   2038 	orr	r9, r11, r9, lsl #32           @ CTR block 1
   2039 	ld1	{ q0}, [r16]                            @ special case vector load initial counter so we can start first AES block as quickly as possible
   2040 
   2041 	fmov	v1.d[1], r9                              @ CTR block 1
   2042 	rev	r9, r12                                @ CTR block 2
   2043 	add	r12, r12, #1                           @ CTR block 2
   2044 
   2045 	fmov	d2, r10                              @ CTR block 2
   2046 	orr	r9, r11, r9, lsl #32           @ CTR block 2
   2047 
   2048 	fmov	v2.d[1], r9                              @ CTR block 2
   2049 	rev	r9, r12                                @ CTR block 3
   2050 
   2051 	orr	r9, r11, r9, lsl #32           @ CTR block 3
   2052 	ld1	{v23.4s}, [r8], #16	                             @ load rk5
   2053 
   2054 	fmov	v3.d[1], r9                              @ CTR block 3
   2055 
   2056 	ld1	{v24.4s}, [r8], #16	                             @ load rk6
   2057 
   2058 	ld1	{v25.4s}, [r8], #16	                             @ load rk7
   2059 
   2060 	aese	q0, v18.16b
   2061 	aesmc	q0, q0         @ AES block 0 - round 0
   2062 	ld1	{ v11.16b}, [r3]
   2063 	ext	v11.16b, v11.16b, v11.16b, #8
   2064 	rev64	v11.16b, v11.16b
   2065 
   2066 	aese	q3, v18.16b
   2067 	aesmc	q3, q3         @ AES block 3 - round 0
   2068 	ld1	{v26.4s}, [r8], #16	                             @ load rk8
   2069 
   2070 	aese	q1, v18.16b
   2071 	aesmc	q1, q1         @ AES block 1 - round 0
   2072 	ldr	q15, [r3, #112]                       @ load h4l | h4h
   2073 #ifndef __ARMEB__
   2074 	ext	v15.16b, v15.16b, v15.16b, #8
   2075 #endif
   2076 	aese	q2, v18.16b
   2077 	aesmc	q2, q2         @ AES block 2 - round 0
   2078 	ld1	{v27.4s}, [r8], #16	                             @ load rk9
   2079 
   2080 	aese	q0, v19.16b
   2081 	aesmc	q0, q0         @ AES block 0 - round 1
   2082 	ld1	{v28.4s}, [r8], #16	                         @ load rk10
   2083 
   2084 	aese	q1, v19.16b
   2085 	aesmc	q1, q1         @ AES block 1 - round 1
   2086 	ldr	q12, [r3, #32]                        @ load h1l | h1h
   2087 #ifndef __ARMEB__
   2088 	ext	v12.16b, v12.16b, v12.16b, #8
   2089 #endif
   2090 	aese	q2, v19.16b
   2091 	aesmc	q2, q2         @ AES block 2 - round 1
   2092 	ld1	{v29.4s}, [r8], #16	                         @ load rk11
   2093 
   2094 	aese	q3, v19.16b
   2095 	aesmc	q3, q3         @ AES block 3 - round 1
   2096 	ldr	q14, [r3, #80]                        @ load h3l | h3h
   2097 #ifndef __ARMEB__
   2098 	ext	v14.16b, v14.16b, v14.16b, #8
   2099 #endif
   2100 	aese	q0, v20.16b
   2101 	aesmc	q0, q0         @ AES block 0 - round 2
   2102 
   2103 	aese	q2, v20.16b
   2104 	aesmc	q2, q2         @ AES block 2 - round 2
   2105 
   2106 	aese	q3, v20.16b
   2107 	aesmc	q3, q3         @ AES block 3 - round 2
   2108 
   2109 	aese	q0, v21.16b
   2110 	aesmc	q0, q0         @ AES block 0 - round 3
   2111 	trn1	q9, v14.2d,    v15.2d                     @ h4h | h3h
   2112 
   2113 	aese	q2, v21.16b
   2114 	aesmc	q2, q2         @ AES block 2 - round 3
   2115 
   2116 	aese	q1, v20.16b
   2117 	aesmc	q1, q1         @ AES block 1 - round 2
   2118 	trn2	v17.2d,  v14.2d,    v15.2d                     @ h4l | h3l
   2119 
   2120 	aese	q0, v22.16b
   2121 	aesmc	q0, q0         @ AES block 0 - round 4
   2122 
   2123 	aese	q3, v21.16b
   2124 	aesmc	q3, q3         @ AES block 3 - round 3
   2125 
   2126 	aese	q1, v21.16b
   2127 	aesmc	q1, q1         @ AES block 1 - round 3
   2128 
   2129 	aese	q0, v23.16b
   2130 	aesmc	q0, q0         @ AES block 0 - round 5
   2131 
   2132 	aese	q2, v22.16b
   2133 	aesmc	q2, q2         @ AES block 2 - round 4
   2134 
   2135 	aese	q1, v22.16b
   2136 	aesmc	q1, q1         @ AES block 1 - round 4
   2137 
   2138 	aese	q0, v24.16b
   2139 	aesmc	q0, q0         @ AES block 0 - round 6
   2140 
   2141 	aese	q3, v22.16b
   2142 	aesmc	q3, q3         @ AES block 3 - round 4
   2143 
   2144 	aese	q2, v23.16b
   2145 	aesmc	q2, q2         @ AES block 2 - round 5
   2146 
   2147 	aese	q1, v23.16b
   2148 	aesmc	q1, q1         @ AES block 1 - round 5
   2149 
   2150 	aese	q3, v23.16b
   2151 	aesmc	q3, q3         @ AES block 3 - round 5
   2152 
   2153 	aese	q2, v24.16b
   2154 	aesmc	q2, q2         @ AES block 2 - round 6
   2155 	ldr	q13, [r3, #64]                        @ load h2l | h2h
   2156 #ifndef __ARMEB__
   2157 	ext	v13.16b, v13.16b, v13.16b, #8
   2158 #endif
   2159 	aese	q1, v24.16b
   2160 	aesmc	q1, q1         @ AES block 1 - round 6
   2161 
   2162 	aese	q3, v24.16b
   2163 	aesmc	q3, q3         @ AES block 3 - round 6
   2164 
   2165 	aese	q0, v25.16b
   2166 	aesmc	q0, q0         @ AES block 0 - round 7
   2167 
   2168 	aese	q1, v25.16b
   2169 	aesmc	q1, q1         @ AES block 1 - round 7
   2170 	trn2	v16.2d,  v12.2d,    v13.2d                     @ h2l | h1l
   2171 
   2172 	aese	q3, v25.16b
   2173 	aesmc	q3, q3         @ AES block 3 - round 7
   2174 
   2175 	aese	q0, v26.16b
   2176 	aesmc	q0, q0         @ AES block 0 - round 8
   2177 
   2178 	aese	q2, v25.16b
   2179 	aesmc	q2, q2         @ AES block 2 - round 7
   2180 	trn1	q8,    v12.2d,    v13.2d                     @ h2h | h1h
   2181 
   2182 	aese	q1, v26.16b
   2183 	aesmc	q1, q1         @ AES block 1 - round 8
   2184 
   2185 	aese	q3, v26.16b
   2186 	aesmc	q3, q3         @ AES block 3 - round 8
   2187 
   2188 	aese	q2, v26.16b
   2189 	aesmc	q2, q2         @ AES block 2 - round 8
   2190 
   2191 	aese	q0, v27.16b
   2192 	aesmc	q0, q0         @ AES block 0 - round 9
   2193 
   2194 	aese	q3, v27.16b
   2195 	aesmc	q3, q3         @ AES block 3 - round 9
   2196 
   2197 	aese	q2, v27.16b
   2198 	aesmc	q2, q2         @ AES block 2 - round 9
   2199 
   2200 	aese	q1, v27.16b
   2201 	aesmc	q1, q1         @ AES block 1 - round 9
   2202 
   2203 	aese	q0, v28.16b
   2204 	aesmc	q0, q0         @ AES block 0 - round 10
   2205 
   2206 	aese	q2, v28.16b
   2207 	aesmc	q2, q2         @ AES block 2 - round 10
   2208 
   2209 	aese	q1, v28.16b
   2210 	aesmc	q1, q1         @ AES block 1 - round 10
   2211 	lsr	r5, r1, #3             @ byte_len
   2212 	mov	r15, r5
   2213 
   2214 	aese	q3, v28.16b
   2215 	aesmc	q3, q3         @ AES block 3 - round 10
   2216 	sub	r5, r5, #1     @ byte_len - 1
   2217 
   2218 	eor	v16.16b, v16.16b, q8                    @ h2k | h1k
   2219 	and	r5, r5, #0xffffffffffffffc0   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   2220 
   2221 	eor	v17.16b, v17.16b, q9                 @ h4k | h3k
   2222 
   2223 	aese	q2, v29.16b                                    @ AES block 2 - round 11
   2224 	add	r4, r0, r1, lsr #3  @ end_input_ptr
   2225 	add	r5, r5, r0
   2226 
   2227 	aese	q1, v29.16b                                    @ AES block 1 - round 11
   2228 	cmp	r0, r5                  @ check if we have <= 4 blocks
   2229 
   2230 	aese	q0, v29.16b                                    @ AES block 0 - round 11
   2231 	add	r12, r12, #1                           @ CTR block 3
   2232 
   2233 	aese	q3, v29.16b                                    @ AES block 3 - round 11
   2234 	bge	.L192_enc_tail                                   @ handle tail
   2235 
   2236 	rev	r9, r12                                @ CTR block 4
   2237 	ldp	r6, r7, [r0, #0]           @ AES block 0 - load plaintext
   2238 #ifdef __ARMEB__
   2239 	rev	r6, r6
   2240 	rev	r7, r7
   2241 #endif
   2242 	orr	r9, r11, r9, lsl #32           @ CTR block 4
   2243 	ldp	r21, r22, [r0, #32]          @ AES block 2 - load plaintext
   2244 #ifdef __ARMEB__
   2245 	rev	r21, r21
   2246 	rev	r22, r22
   2247 #endif
   2248 	ldp	r23, r24, [r0, #48]          @ AES block 3 - load plaintext
   2249 #ifdef __ARMEB__
   2250 	rev	r23, r23
   2251 	rev	r24, r24
   2252 #endif
   2253 	ldp	r19, r20, [r0, #16]          @ AES block 1 - load plaintext
   2254 #ifdef __ARMEB__
   2255 	rev	r19, r19
   2256 	rev	r20, r20
   2257 #endif
   2258 	add	r0, r0, #64                      @ AES input_ptr update
   2259 	cmp	r0, r5                  @ check if we have <= 8 blocks
   2260 
   2261 	eor	r6, r6, r13                    @ AES block 0 - round 12 low
   2262 
   2263 	eor	r7, r7, r14                    @ AES block 0 - round 12 high
   2264 	eor	r22, r22, r14                    @ AES block 2 - round 12 high
   2265 	fmov	d4, r6                              @ AES block 0 - mov low
   2266 
   2267 	eor	r24, r24, r14                    @ AES block 3 - round 12 high
   2268 	fmov	v4.d[1], r7                          @ AES block 0 - mov high
   2269 
   2270 	eor	r21, r21, r13                    @ AES block 2 - round 12 low
   2271 	eor	r19, r19, r13                    @ AES block 1 - round 12 low
   2272 
   2273 	fmov	d5, r19                              @ AES block 1 - mov low
   2274 	eor	r20, r20, r14                    @ AES block 1 - round 12 high
   2275 
   2276 	fmov	v5.d[1], r20                          @ AES block 1 - mov high
   2277 
   2278 	eor	r23, r23, r13                    @ AES block 3 - round 12 low
   2279 	fmov	d6, r21                              @ AES block 2 - mov low
   2280 
   2281 	add	r12, r12, #1                           @ CTR block 4
   2282 	eor	q4, q4, q0                         @ AES block 0 - result
   2283 	fmov	d0, r10                              @ CTR block 4
   2284 
   2285 	fmov	v0.d[1], r9                              @ CTR block 4
   2286 	rev	r9, r12                                @ CTR block 5
   2287 
   2288 	orr	r9, r11, r9, lsl #32           @ CTR block 5
   2289 	add	r12, r12, #1                           @ CTR block 5
   2290 
   2291 	fmov	d7, r23                              @ AES block 3 - mov low
   2292 	st1	{ q4}, [r2], #16                    @ AES block 0 - store result
   2293 
   2294 	fmov	v6.d[1], r22                          @ AES block 2 - mov high
   2295 
   2296 	eor	q5, q5, q1                         @ AES block 1 - result
   2297 	fmov	d1, r10                              @ CTR block 5
   2298 	st1	{ q5}, [r2], #16                    @ AES block 1 - store result
   2299 
   2300 	fmov	v7.d[1], r24                          @ AES block 3 - mov high
   2301 
   2302 	fmov	v1.d[1], r9                              @ CTR block 5
   2303 	rev	r9, r12                                @ CTR block 6
   2304 
   2305 	orr	r9, r11, r9, lsl #32           @ CTR block 6
   2306 
   2307 	add	r12, r12, #1                           @ CTR block 6
   2308 	eor	q6, q6, q2                         @ AES block 2 - result
   2309 	fmov	d2, r10                              @ CTR block 6
   2310 
   2311 	fmov	v2.d[1], r9                              @ CTR block 6
   2312 	rev	r9, r12                                @ CTR block 7
   2313 
   2314 	orr	r9, r11, r9, lsl #32           @ CTR block 7
   2315 	st1	{ q6}, [r2], #16                    @ AES block 2 - store result
   2316 
   2317 	eor	q7, q7, q3                         @ AES block 3 - result
   2318 	st1	{ q7}, [r2], #16                    @ AES block 3 - store result
   2319 	bge	.L192_enc_prepretail                             @ do prepretail
   2320 
   2321 .L192_enc_main_loop:@ main loop start
   2322 	aese	q2, v18.16b
   2323 	aesmc	q2, q2         @ AES block 4k+6 - round 0
   2324 	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
   2325 
   2326 	aese	q1, v18.16b
   2327 	aesmc	q1, q1         @ AES block 4k+5 - round 0
   2328 	ldp	r19, r20, [r0, #16]          @ AES block 4k+5 - load plaintext
   2329 #ifdef __ARMEB__
   2330 	rev	r19, r19
   2331 	rev	r20, r20
   2332 #endif
   2333 	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
   2334 	fmov	d3, r10                              @ CTR block 4k+3
   2335 	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
   2336 
   2337 	aese	q2, v19.16b
   2338 	aesmc	q2, q2         @ AES block 4k+6 - round 1
   2339 	fmov	v3.d[1], r9                              @ CTR block 4k+3
   2340 
   2341 	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
   2342 	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2343 	ldp	r21, r22, [r0, #32]          @ AES block 4k+6 - load plaintext
   2344 #ifdef __ARMEB__
   2345 	rev	r21, r21
   2346 	rev	r22, r22
   2347 #endif
   2348 	aese	q0, v18.16b
   2349 	aesmc	q0, q0         @ AES block 4k+4 - round 0
   2350 	ldp	r23, r24, [r0, #48]          @ AES block 4k+3 - load plaintext
   2351 #ifdef __ARMEB__
   2352 	rev	r23, r23
   2353 	rev	r24, r24
   2354 #endif
   2355 	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
   2356 	eor	q4, q4, v11.16b                          @ PRE 1
   2357 
   2358 	aese	q1, v19.16b
   2359 	aesmc	q1, q1         @ AES block 4k+5 - round 1
   2360 
   2361 	aese	q0, v19.16b
   2362 	aesmc	q0, q0         @ AES block 4k+4 - round 1
   2363 	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
   2364 
   2365 	aese	q3, v18.16b
   2366 	aesmc	q3, q3         @ AES block 4k+7 - round 0
   2367 	eor	r24, r24, r14                    @ AES block 4k+3 - round 12 high
   2368 
   2369 	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
   2370 	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
   2371 
   2372 	aese	q0, v20.16b
   2373 	aesmc	q0, q0         @ AES block 4k+4 - round 2
   2374 
   2375 	aese	q3, v19.16b
   2376 	aesmc	q3, q3         @ AES block 4k+7 - round 1
   2377 	eor	r21, r21, r13                    @ AES block 4k+6 - round 12 low
   2378 
   2379 	eor	q8, q8, q4                         @ GHASH block 4k - mid
   2380 	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
   2381 
   2382 	aese	q0, v21.16b
   2383 	aesmc	q0, q0         @ AES block 4k+4 - round 3
   2384 	eor	r19, r19, r13                    @ AES block 4k+5 - round 12 low
   2385 
   2386 	aese	q1, v20.16b
   2387 	aesmc	q1, q1         @ AES block 4k+5 - round 2
   2388 	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
   2389 
   2390 	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
   2391 	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
   2392 
   2393 	aese	q2, v20.16b
   2394 	aesmc	q2, q2         @ AES block 4k+6 - round 2
   2395 
   2396 	aese	q1, v21.16b
   2397 	aesmc	q1, q1         @ AES block 4k+5 - round 3
   2398 
   2399 	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
   2400 	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
   2401 
   2402 	aese	q3, v20.16b
   2403 	aesmc	q3, q3         @ AES block 4k+7 - round 2
   2404 	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
   2405 
   2406 	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
   2407 
   2408 	aese	q0, v22.16b
   2409 	aesmc	q0, q0         @ AES block 4k+4 - round 4
   2410 	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
   2411 
   2412 	aese	q3, v21.16b
   2413 	aesmc	q3, q3         @ AES block 4k+7 - round 3
   2414 
   2415 	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
   2416 	eor	r20, r20, r14                    @ AES block 4k+5 - round 12 high
   2417 	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
   2418 
   2419 	aese	q0, v23.16b
   2420 	aesmc	q0, q0         @ AES block 4k+4 - round 5
   2421 	add	r12, r12, #1                           @ CTR block 4k+3
   2422 
   2423 	aese	q3, v22.16b
   2424 	aesmc	q3, q3         @ AES block 4k+7 - round 4
   2425 	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
   2426 
   2427 	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
   2428 	eor	r22, r22, r14                    @ AES block 4k+6 - round 12 high
   2429 
   2430 	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
   2431 	eor	r23, r23, r13                    @ AES block 4k+3 - round 12 low
   2432 	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
   2433 
   2434 	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
   2435 	rev	r9, r12                                @ CTR block 4k+8
   2436 
   2437 	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
   2438 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+8
   2439 
   2440 	aese	q2, v21.16b
   2441 	aesmc	q2, q2         @ AES block 4k+6 - round 3
   2442 	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
   2443 
   2444 	aese	q1, v22.16b
   2445 	aesmc	q1, q1         @ AES block 4k+5 - round 4
   2446 	ldp	r6, r7, [r0, #0]           @ AES block 4k+4 - load plaintext
   2447 #ifdef __ARMEB__
   2448 	rev	r6, r6
   2449 	rev	r7, r7
   2450 #endif
   2451 	aese	q0, v24.16b
   2452 	aesmc	q0, q0         @ AES block 4k+4 - round 6
   2453 	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
   2454 
   2455 	aese	q2, v22.16b
   2456 	aesmc	q2, q2         @ AES block 4k+6 - round 4
   2457 	add	r0, r0, #64                      @ AES input_ptr update
   2458 
   2459 	aese	q1, v23.16b
   2460 	aesmc	q1, q1         @ AES block 4k+5 - round 5
   2461 	movi	q8, #0xc2
   2462 
   2463 	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
   2464 	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
   2465 	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
   2466 
   2467 	aese	q2, v23.16b
   2468 	aesmc	q2, q2         @ AES block 4k+6 - round 5
   2469 	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
   2470 
   2471 	aese	q1, v24.16b
   2472 	aesmc	q1, q1         @ AES block 4k+5 - round 6
   2473 	shl	d8, d8, #56              @ mod_constant
   2474 
   2475 	aese	q3, v23.16b
   2476 	aesmc	q3, q3         @ AES block 4k+7 - round 5
   2477 	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
   2478 
   2479 	aese	q0, v25.16b
   2480 	aesmc	q0, q0         @ AES block 4k+4 - round 7
   2481 	fmov	d5, r19                              @ AES block 4k+5 - mov low
   2482 
   2483 	aese	q1, v25.16b
   2484 	aesmc	q1, q1         @ AES block 4k+5 - round 7
   2485 	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
   2486 
   2487 	aese	q3, v24.16b
   2488 	aesmc	q3, q3         @ AES block 4k+7 - round 6
   2489 	fmov	v5.d[1], r20                          @ AES block 4k+5 - mov high
   2490 
   2491 	aese	q0, v26.16b
   2492 	aesmc	q0, q0         @ AES block 4k+4 - round 8
   2493 	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
   2494 
   2495 	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
   2496 	cmp	r0, r5                  @ .LOOP CONTROL
   2497 	fmov	d4, r6                              @ AES block 4k+4 - mov low
   2498 
   2499 	aese	q2, v24.16b
   2500 	aesmc	q2, q2         @ AES block 4k+6 - round 6
   2501 	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
   2502 
   2503 	aese	q1, v26.16b
   2504 	aesmc	q1, q1         @ AES block 4k+5 - round 8
   2505 	fmov	d7, r23                              @ AES block 4k+3 - mov low
   2506 
   2507 	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
   2508 	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
   2509 	add	r12, r12, #1                           @ CTR block 4k+8
   2510 
   2511 	aese	q2, v25.16b
   2512 	aesmc	q2, q2         @ AES block 4k+6 - round 7
   2513 	fmov	v7.d[1], r24                          @ AES block 4k+3 - mov high
   2514 
   2515 	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
   2516 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   2517 	fmov	d6, r21                              @ AES block 4k+6 - mov low
   2518 
   2519 	aese	q3, v25.16b
   2520 	aesmc	q3, q3         @ AES block 4k+7 - round 7
   2521 
   2522 	aese	q0, v27.16b
   2523 	aesmc	q0, q0         @ AES block 4k+4 - round 9
   2524 	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
   2525 
   2526 	aese	q2, v26.16b
   2527 	aesmc	q2, q2         @ AES block 4k+6 - round 8
   2528 
   2529 	aese	q3, v26.16b
   2530 	aesmc	q3, q3         @ AES block 4k+7 - round 8
   2531 
   2532 	aese	q1, v27.16b
   2533 	aesmc	q1, q1         @ AES block 4k+5 - round 9
   2534 
   2535 	aese	q0, v28.16b
   2536 	aesmc	q0, q0         @ AES block 4k+4 - round 10
   2537 	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
   2538 
   2539 	aese	q3, v27.16b
   2540 	aesmc	q3, q3         @ AES block 4k+7 - round 9
   2541 
   2542 	aese	q2, v27.16b
   2543 	aesmc	q2, q2         @ AES block 4k+6 - round 9
   2544 
   2545 	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
   2546 
   2547 	aese	q1, v28.16b
   2548 	aesmc	q1, q1         @ AES block 4k+5 - round 10
   2549 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   2550 
   2551 	aese	q2, v28.16b
   2552 	aesmc	q2, q2         @ AES block 4k+6 - round 10
   2553 
   2554 	eor	q4, q4, q0                         @ AES block 4k+4 - result
   2555 	fmov	d0, r10                              @ CTR block 4k+8
   2556 
   2557 	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
   2558 	fmov	v0.d[1], r9                              @ CTR block 4k+8
   2559 	rev	r9, r12                                @ CTR block 4k+9
   2560 
   2561 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   2562 	fmov	v6.d[1], r22                          @ AES block 4k+6 - mov high
   2563 	st1	{ q4}, [r2], #16                    @ AES block 4k+4 - store result
   2564 
   2565 	aese	q3, v28.16b
   2566 	aesmc	q3, q3         @ AES block 4k+7 - round 10
   2567 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+9
   2568 
   2569 	eor	q5, q5, q1                         @ AES block 4k+5 - result
   2570 	add	r12, r12, #1                           @ CTR block 4k+9
   2571 	fmov	d1, r10                              @ CTR block 4k+9
   2572 
   2573 	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
   2574 	fmov	v1.d[1], r9                              @ CTR block 4k+9
   2575 	rev	r9, r12                                @ CTR block 4k+10
   2576 
   2577 	add	r12, r12, #1                           @ CTR block 4k+10
   2578 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   2579 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+10
   2580 
   2581 	st1	{ q5}, [r2], #16                    @ AES block 4k+5 - store result
   2582 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   2583 
   2584 	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
   2585 	eor	q6, q6, q2                         @ AES block 4k+6 - result
   2586 	fmov	d2, r10                              @ CTR block 4k+10
   2587 
   2588 	st1	{ q6}, [r2], #16                    @ AES block 4k+6 - store result
   2589 	fmov	v2.d[1], r9                              @ CTR block 4k+10
   2590 	rev	r9, r12                                @ CTR block 4k+11
   2591 
   2592 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   2593 	orr	r9, r11, r9, lsl #32           @ CTR block 4k+11
   2594 
   2595 	eor	q7, q7, q3                         @ AES block 4k+3 - result
   2596 	st1	{ q7}, [r2], #16                    @ AES block 4k+3 - store result
   2597 	blt	.L192_enc_main_loop
   2598 
   2599 .L192_enc_prepretail:@ PREPRETAIL
   2600 	aese	q0, v18.16b
   2601 	aesmc	q0, q0         @ AES block 4k+4 - round 0
   2602 	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
   2603 
   2604 	fmov	d3, r10                              @ CTR block 4k+3
   2605 	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
   2606 	add	r12, r12, #1                           @ CTR block 4k+3
   2607 
   2608 	aese	q1, v18.16b
   2609 	aesmc	q1, q1         @ AES block 4k+5 - round 0
   2610 	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
   2611 
   2612 	aese	q2, v18.16b
   2613 	aesmc	q2, q2         @ AES block 4k+6 - round 0
   2614 
   2615 	fmov	v3.d[1], r9                              @ CTR block 4k+3
   2616 	eor	q4, q4, v11.16b                          @ PRE 1
   2617 	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
   2618 
   2619 	aese	q1, v19.16b
   2620 	aesmc	q1, q1         @ AES block 4k+5 - round 1
   2621 	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
   2622 
   2623 	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
   2624 
   2625 	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
   2626 	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
   2627 
   2628 	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
   2629 	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2630 
   2631 	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
   2632 
   2633 	eor	q8, q8, q4                         @ GHASH block 4k - mid
   2634 	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
   2635 
   2636 	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
   2637 	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
   2638 
   2639 	aese	q3, v18.16b
   2640 	aesmc	q3, q3         @ AES block 4k+7 - round 0
   2641 	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
   2642 
   2643 	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
   2644 
   2645 	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
   2646 	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
   2647 
   2648 	aese	q3, v19.16b
   2649 	aesmc	q3, q3         @ AES block 4k+7 - round 1
   2650 
   2651 	aese	q2, v19.16b
   2652 	aesmc	q2, q2         @ AES block 4k+6 - round 1
   2653 	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
   2654 
   2655 	aese	q0, v19.16b
   2656 	aesmc	q0, q0         @ AES block 4k+4 - round 1
   2657 
   2658 	aese	q1, v20.16b
   2659 	aesmc	q1, q1         @ AES block 4k+5 - round 2
   2660 	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
   2661 
   2662 	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
   2663 	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
   2664 
   2665 	aese	q0, v20.16b
   2666 	aesmc	q0, q0         @ AES block 4k+4 - round 2
   2667 
   2668 	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
   2669 	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
   2670 
   2671 	aese	q1, v21.16b
   2672 	aesmc	q1, q1         @ AES block 4k+5 - round 3
   2673 
   2674 	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
   2675 
   2676 	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
   2677 
   2678 	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
   2679 	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
   2680 
   2681 	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
   2682 
   2683 	aese	q0, v21.16b
   2684 	aesmc	q0, q0         @ AES block 4k+4 - round 3
   2685 	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
   2686 
   2687 	aese	q3, v20.16b
   2688 	aesmc	q3, q3         @ AES block 4k+7 - round 2
   2689 
   2690 	aese	q2, v20.16b
   2691 	aesmc	q2, q2         @ AES block 4k+6 - round 2
   2692 	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
   2693 
   2694 	aese	q0, v22.16b
   2695 	aesmc	q0, q0         @ AES block 4k+4 - round 4
   2696 
   2697 	aese	q3, v21.16b
   2698 	aesmc	q3, q3         @ AES block 4k+7 - round 3
   2699 	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
   2700 
   2701 	aese	q2, v21.16b
   2702 	aesmc	q2, q2         @ AES block 4k+6 - round 3
   2703 
   2704 	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
   2705 	movi	q8, #0xc2
   2706 
   2707 	aese	q3, v22.16b
   2708 	aesmc	q3, q3         @ AES block 4k+7 - round 4
   2709 
   2710 	aese	q2, v22.16b
   2711 	aesmc	q2, q2         @ AES block 4k+6 - round 4
   2712 
   2713 	aese	q1, v22.16b
   2714 	aesmc	q1, q1         @ AES block 4k+5 - round 4
   2715 	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
   2716 
   2717 	aese	q3, v23.16b
   2718 	aesmc	q3, q3         @ AES block 4k+7 - round 5
   2719 
   2720 	aese	q2, v23.16b
   2721 	aesmc	q2, q2         @ AES block 4k+6 - round 5
   2722 
   2723 	aese	q1, v23.16b
   2724 	aesmc	q1, q1         @ AES block 4k+5 - round 5
   2725 	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
   2726 
   2727 	aese	q0, v23.16b
   2728 	aesmc	q0, q0         @ AES block 4k+4 - round 5
   2729 
   2730 	aese	q3, v24.16b
   2731 	aesmc	q3, q3         @ AES block 4k+7 - round 6
   2732 	eor	v10.16b, v10.16b, q9                        @ karatsuba tidy up
   2733 
   2734 	aese	q1, v24.16b
   2735 	aesmc	q1, q1         @ AES block 4k+5 - round 6
   2736 
   2737 	aese	q0, v24.16b
   2738 	aesmc	q0, q0         @ AES block 4k+4 - round 6
   2739 	shl	d8, d8, #56              @ mod_constant
   2740 
   2741 	aese	q3, v25.16b
   2742 	aesmc	q3, q3         @ AES block 4k+7 - round 7
   2743 
   2744 	aese	q1, v25.16b
   2745 	aesmc	q1, q1         @ AES block 4k+5 - round 7
   2746 	eor	v10.16b, v10.16b, v11.16b
   2747 
   2748 	aese	q0, v25.16b
   2749 	aesmc	q0, q0         @ AES block 4k+4 - round 7
   2750 
   2751 	pmull	v30.1q, q9, q8
   2752 
   2753 	aese	q2, v24.16b
   2754 	aesmc	q2, q2         @ AES block 4k+6 - round 6
   2755 	ext	q9, q9, q9, #8
   2756 
   2757 	aese	q0, v26.16b
   2758 	aesmc	q0, q0         @ AES block 4k+4 - round 8
   2759 
   2760 	aese	q1, v26.16b
   2761 	aesmc	q1, q1         @ AES block 4k+5 - round 8
   2762 	eor	v10.16b, v10.16b, v30.16b
   2763 
   2764 	aese	q2, v25.16b
   2765 	aesmc	q2, q2         @ AES block 4k+6 - round 7
   2766 
   2767 	aese	q3, v26.16b
   2768 	aesmc	q3, q3         @ AES block 4k+7 - round 8
   2769 
   2770 	aese	q0, v27.16b
   2771 	aesmc	q0, q0         @ AES block 4k+4 - round 9
   2772 
   2773 	aese	q2, v26.16b
   2774 	aesmc	q2, q2         @ AES block 4k+6 - round 8
   2775 	eor	v10.16b, v10.16b, q9
   2776 
   2777 	aese	q3, v27.16b
   2778 	aesmc	q3, q3         @ AES block 4k+7 - round 9
   2779 
   2780 	aese	q1, v27.16b
   2781 	aesmc	q1, q1         @ AES block 4k+5 - round 9
   2782 
   2783 	aese	q2, v27.16b
   2784 	aesmc	q2, q2         @ AES block 4k+6 - round 9
   2785 
   2786 	pmull	v30.1q, v10.1d, q8
   2787 
   2788 	ext	v10.16b, v10.16b, v10.16b, #8
   2789 
   2790 	aese	q3, v28.16b
   2791 	aesmc	q3, q3         @ AES block 4k+7 - round 10
   2792 
   2793 	aese	q0, v28.16b
   2794 	aesmc	q0, q0         @ AES block 4k+4 - round 10
   2795 
   2796 	aese	q2, v28.16b
   2797 	aesmc	q2, q2         @ AES block 4k+6 - round 10
   2798 
   2799 	aese	q1, v28.16b
   2800 	aesmc	q1, q1         @ AES block 4k+5 - round 10
   2801 	eor	v11.16b, v11.16b, v30.16b
   2802 
   2803 	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
   2804 
   2805 	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
   2806 
   2807 	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
   2808 
   2809 	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
   2810 	eor	v11.16b, v11.16b, v10.16b
   2811 .L192_enc_tail:@ TAIL
   2812 
   2813 	sub	r5, r4, r0  @ main_end_input_ptr is number of bytes left to process
   2814 	ldp	r6, r7, [r0], #16          @ AES block 4k+4 - load plaintext
   2815 #ifdef __ARMEB__
   2816 	rev	r6, r6
   2817 	rev	r7, r7
   2818 #endif
   2819 	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
   2820 	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
   2821 
   2822 	fmov	d4, r6                              @ AES block 4k+4 - mov low
   2823 
   2824 	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
   2825 	cmp	r5, #48
   2826 
   2827 	eor	q5, q4, q0                         @ AES block 4k+4 - result
   2828 
   2829 	ext	q8, v11.16b, v11.16b, #8                    @ prepare final partial tag
   2830 	bgt	.L192_enc_blocks_more_than_3
   2831 
   2832 	sub	r12, r12, #1
   2833 	movi	v10.8b, #0
   2834 
   2835 	mov	q3, q2
   2836 	movi	q9, #0
   2837 	cmp	r5, #32
   2838 
   2839 	mov	q2, q1
   2840 	movi	v11.8b, #0
   2841 	bgt	.L192_enc_blocks_more_than_2
   2842 
   2843 	sub	r12, r12, #1
   2844 
   2845 	mov	q3, q1
   2846 	cmp	r5, #16
   2847 	bgt	.L192_enc_blocks_more_than_1
   2848 
   2849 	sub	r12, r12, #1
   2850 	b	.L192_enc_blocks_less_than_1
   2851 .L192_enc_blocks_more_than_3:@ blocks left >  3
   2852 	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
   2853 
   2854 	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
   2855 #ifdef __ARMEB__
   2856 	rev	r6, r6
   2857 	rev	r7, r7
   2858 #endif
   2859 	rev64	q4, q5                                   @ GHASH final-3 block
   2860 
   2861 	eor	r6, r6, r13                    @ AES final-2 block - round 12 low
   2862 	eor	q4, q4, q8                          @ feed in partial tag
   2863 
   2864 	eor	r7, r7, r14                    @ AES final-2 block - round 12 high
   2865 	fmov	d5, r6                                @ AES final-2 block - mov low
   2866 
   2867 	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
   2868 
   2869 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   2870 
   2871 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   2872 
   2873 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   2874 
   2875 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   2876 
   2877 	movi	q8, #0                                       @ suppress further partial tag feed in
   2878 
   2879 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   2880 
   2881 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   2882 	eor	q5, q5, q1                           @ AES final-2 block - result
   2883 .L192_enc_blocks_more_than_2:@ blocks left >  2
   2884 
   2885 	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
   2886 
   2887 	rev64	q4, q5                                   @ GHASH final-2 block
   2888 	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
   2889 #ifdef __ARMEB__
   2890 	rev	r6, r6
   2891 	rev	r7, r7
   2892 #endif
   2893 	eor	q4, q4, q8                          @ feed in partial tag
   2894 
   2895 	eor	r7, r7, r14                    @ AES final-1 block - round 12 high
   2896 
   2897 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   2898 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   2899 
   2900 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   2901 	eor	r6, r6, r13                    @ AES final-1 block - round 12 low
   2902 
   2903 	fmov	d5, r6                                @ AES final-1 block - mov low
   2904 
   2905 	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
   2906 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   2907 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   2908 
   2909 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   2910 
   2911 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   2912 
   2913 	movi	q8, #0                                       @ suppress further partial tag feed in
   2914 
   2915 	eor	q5, q5, q2                           @ AES final-1 block - result
   2916 
   2917 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   2918 .L192_enc_blocks_more_than_1:@ blocks left >  1
   2919 
   2920 	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result
   2921 
   2922 	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high
   2923 #ifdef __ARMEB__
   2924 	rev	r6, r6
   2925 	rev	r7, r7
   2926 #endif
   2927 	rev64	q4, q5                                   @ GHASH final-1 block
   2928 
   2929 	eor	r6, r6, r13                    @ AES final block - round 12 low
   2930 	eor	q4, q4, q8                          @ feed in partial tag
   2931 	movi	q8, #0                                       @ suppress further partial tag feed in
   2932 
   2933 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   2934 
   2935 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   2936 	eor	r7, r7, r14                    @ AES final block - round 12 high
   2937 	fmov	d5, r6                                @ AES final block - mov low
   2938 
   2939 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   2940 	fmov	v5.d[1], r7                            @ AES final block - mov high
   2941 
   2942 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   2943 
   2944 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   2945 
   2946 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   2947 
   2948 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   2949 
   2950 	eor	q5, q5, q3                           @ AES final block - result
   2951 
   2952 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   2953 
   2954 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   2955 .L192_enc_blocks_less_than_1:@ blocks left <= 1
   2956 
   2957 	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
   2958 #ifndef __ARMEB__
   2959 	rev	r9, r12
   2960 #else
   2961 	mov	r9, r12
   2962 #endif
   2963 	and	r1, r1, #127                   @ bit_length %= 128
   2964 
   2965 	sub	r1, r1, #128                   @ bit_length -= 128
   2966 	mvn	r14, xzr                                     @ rk12_h = 0xffffffffffffffff
   2967 
   2968 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   2969 	mvn	r13, xzr                                     @ rk12_l = 0xffffffffffffffff
   2970 
   2971 	and	r1, r1, #127                   @ bit_length %= 128
   2972 
   2973 	lsr	r14, r14, r1                    @ rk12_h is mask for top 64b of last block
   2974 	cmp	r1, #64
   2975 
   2976 	csel	r6, r13, r14, lt
   2977 	csel	r7, r14, xzr, lt
   2978 
   2979 	fmov	d0, r6                                @ ctr0b is mask for last block
   2980 
   2981 	fmov	v0.d[1], r7
   2982 
   2983 	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
   2984 
   2985 	rev64	q4, q5                                   @ GHASH final block
   2986 
   2987 	eor	q4, q4, q8                          @ feed in partial tag
   2988 
   2989 	mov	d8, v4.d[1]                                 @ GHASH final block - mid
   2990 
   2991 	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
   2992 
   2993 	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
   2994 
   2995 	eor	q8, q8, q4                         @ GHASH final block - mid
   2996 
   2997 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
   2998 
   2999 	eor	q9, q9, v20.16b                           @ GHASH final block - high
   3000 
   3001 	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
   3002 
   3003 	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
   3004 	movi	q8, #0xc2
   3005 
   3006 	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
   3007 
   3008 	shl	d8, d8, #56              @ mod_constant
   3009 
   3010 	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
   3011 
   3012 	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
   3013 
   3014 	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
   3015 
   3016 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   3017 
   3018 	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
   3019 
   3020 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   3021 
   3022 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   3023 
   3024 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   3025 
   3026 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   3027 	str	r9, [r16, #12]                         @ store the updated counter
   3028 
   3029 	st1	{ q5}, [r2]                         @ store all 16B
   3030 
   3031 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   3032 	ext	v11.16b, v11.16b, v11.16b, #8
   3033 	rev64	v11.16b, v11.16b
   3034 	mov	r0, r15
   3035 	st1	{ v11.16b }, [r3]
   3036 
   3037 	ldp	r21, r22, [sp, #16]
   3038 	ldp	r23, r24, [sp, #32]
   3039 	ldp	d8, d9, [sp, #48]
   3040 	ldp	d10, d11, [sp, #64]
   3041 	ldp	d12, d13, [sp, #80]
   3042 	ldp	d14, d15, [sp, #96]
   3043 	ldp	r19, r20, [sp], #112
   3044 	RET
   3045 
   3046 .L192_enc_ret:
   3047 	mov	r0, #0x0
   3048 	RET
   3049 .size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
   3050 .globl	aes_gcm_dec_192_kernel
   3051 .type	aes_gcm_dec_192_kernel,%function
   3052 .align	4
   3053 aes_gcm_dec_192_kernel:
   3054 	AARCH64_VALID_CALL_TARGET
   3055 	cbz	r1, .L192_dec_ret
   3056 	stp	r19, r20, [sp, #-112]!
   3057 	mov	r16, r4
   3058 	mov	r8, r5
   3059 	stp	r21, r22, [sp, #16]
   3060 	stp	r23, r24, [sp, #32]
   3061 	stp	d8, d9, [sp, #48]
   3062 	stp	d10, d11, [sp, #64]
   3063 	stp	d12, d13, [sp, #80]
   3064 	stp	d14, d15, [sp, #96]
   3065 
   3066 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   3067 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   3068 #ifdef __ARMEB__
   3069 	rev	r10, r10
   3070 	rev	r11, r11
   3071 #endif
   3072 	ldp	r13, r14, [r8, #192]                     @ load rk12
   3073 #ifdef __ARMEB__
   3074 	ror	r13, r13, #32
   3075 	ror	r14, r14, #32
   3076 #endif
   3077 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   3078 
   3079 	ld1	{v18.4s}, [r8], #16                                  @ load rk0
   3080 
   3081 	lsr	r5, r1, #3              @ byte_len
   3082 	mov	r15, r5
   3083 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   3084 
   3085 	lsr	r12, r11, #32
   3086 	orr	r11, r11, r11
   3087 	fmov	d3, r10                               @ CTR block 3
   3088 
   3089 	rev	r12, r12                                @ rev_ctr32
   3090 	fmov	d1, r10                               @ CTR block 1
   3091 
   3092 	add	r12, r12, #1                            @ increment rev_ctr32
   3093 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   3094 
   3095 	aese	q0, v18.16b
   3096 	aesmc	q0, q0          @ AES block 0 - round 0
   3097 	rev	r9, r12                                 @ CTR block 1
   3098 
   3099 	add	r12, r12, #1                            @ CTR block 1
   3100 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   3101 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   3102 
   3103 	fmov	v1.d[1], r9                               @ CTR block 1
   3104 	rev	r9, r12                                 @ CTR block 2
   3105 	add	r12, r12, #1                            @ CTR block 2
   3106 
   3107 	fmov	d2, r10                               @ CTR block 2
   3108 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   3109 
   3110 	fmov	v2.d[1], r9                               @ CTR block 2
   3111 	rev	r9, r12                                 @ CTR block 3
   3112 
   3113 	aese	q0, v19.16b
   3114 	aesmc	q0, q0          @ AES block 0 - round 1
   3115 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   3116 
   3117 	fmov	v3.d[1], r9                               @ CTR block 3
   3118 
   3119 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   3120 
   3121 	aese	q0, v20.16b
   3122 	aesmc	q0, q0          @ AES block 0 - round 2
   3123 
   3124 	aese	q2, v18.16b
   3125 	aesmc	q2, q2          @ AES block 2 - round 0
   3126 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   3127 
   3128 	aese	q1, v18.16b
   3129 	aesmc	q1, q1          @ AES block 1 - round 0
   3130 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   3131 #ifndef __ARMEB__
   3132 	ext	v15.16b, v15.16b, v15.16b, #8
   3133 #endif
   3134 	aese	q3, v18.16b
   3135 	aesmc	q3, q3          @ AES block 3 - round 0
   3136 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   3137 #ifndef __ARMEB__
   3138 	ext	v13.16b, v13.16b, v13.16b, #8
   3139 #endif
   3140 	aese	q2, v19.16b
   3141 	aesmc	q2, q2          @ AES block 2 - round 1
   3142 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   3143 #ifndef __ARMEB__
   3144 	ext	v14.16b, v14.16b, v14.16b, #8
   3145 #endif
   3146 	aese	q1, v19.16b
   3147 	aesmc	q1, q1          @ AES block 1 - round 1
   3148 
   3149 	aese	q3, v19.16b
   3150 	aesmc	q3, q3          @ AES block 3 - round 1
   3151 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   3152 #ifndef __ARMEB__
   3153 	ext	v12.16b, v12.16b, v12.16b, #8
   3154 #endif
   3155 	aese	q2, v20.16b
   3156 	aesmc	q2, q2          @ AES block 2 - round 2
   3157 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   3158 
   3159 	aese	q0, v21.16b
   3160 	aesmc	q0, q0          @ AES block 0 - round 3
   3161 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   3162 
   3163 	aese	q1, v20.16b
   3164 	aesmc	q1, q1          @ AES block 1 - round 2
   3165 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   3166 
   3167 	aese	q3, v20.16b
   3168 	aesmc	q3, q3          @ AES block 3 - round 2
   3169 	ld1	{v27.4s}, [r8], #16                               @ load rk9
   3170 
   3171 	aese	q2, v21.16b
   3172 	aesmc	q2, q2          @ AES block 2 - round 3
   3173 	ld1	{ v11.16b}, [r3]
   3174 	ext	v11.16b, v11.16b, v11.16b, #8
   3175 	rev64	v11.16b, v11.16b
   3176 
   3177 	aese	q1, v21.16b
   3178 	aesmc	q1, q1          @ AES block 1 - round 3
   3179 	add	r12, r12, #1                            @ CTR block 3
   3180 
   3181 	aese	q3, v21.16b
   3182 	aesmc	q3, q3          @ AES block 3 - round 3
   3183 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   3184 
   3185 	aese	q0, v22.16b
   3186 	aesmc	q0, q0          @ AES block 0 - round 4
   3187 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   3188 
   3189 	aese	q1, v22.16b
   3190 	aesmc	q1, q1          @ AES block 1 - round 4
   3191 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   3192 
   3193 	aese	q2, v22.16b
   3194 	aesmc	q2, q2          @ AES block 2 - round 4
   3195 
   3196 	aese	q3, v22.16b
   3197 	aesmc	q3, q3          @ AES block 3 - round 4
   3198 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   3199 
   3200 	aese	q0, v23.16b
   3201 	aesmc	q0, q0          @ AES block 0 - round 5
   3202 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   3203 
   3204 	aese	q1, v23.16b
   3205 	aesmc	q1, q1          @ AES block 1 - round 5
   3206 
   3207 	aese	q2, v23.16b
   3208 	aesmc	q2, q2          @ AES block 2 - round 5
   3209 
   3210 	aese	q3, v23.16b
   3211 	aesmc	q3, q3          @ AES block 3 - round 5
   3212 
   3213 	aese	q0, v24.16b
   3214 	aesmc	q0, q0          @ AES block 0 - round 6
   3215 
   3216 	aese	q2, v24.16b
   3217 	aesmc	q2, q2          @ AES block 2 - round 6
   3218 
   3219 	aese	q3, v24.16b
   3220 	aesmc	q3, q3          @ AES block 3 - round 6
   3221 
   3222 	aese	q0, v25.16b
   3223 	aesmc	q0, q0          @ AES block 0 - round 7
   3224 
   3225 	aese	q2, v25.16b
   3226 	aesmc	q2, q2          @ AES block 2 - round 7
   3227 
   3228 	aese	q3, v25.16b
   3229 	aesmc	q3, q3          @ AES block 3 - round 7
   3230 
   3231 	aese	q1, v24.16b
   3232 	aesmc	q1, q1          @ AES block 1 - round 6
   3233 
   3234 	aese	q2, v26.16b
   3235 	aesmc	q2, q2          @ AES block 2 - round 8
   3236 
   3237 	aese	q3, v26.16b
   3238 	aesmc	q3, q3          @ AES block 3 - round 8
   3239 
   3240 	aese	q1, v25.16b
   3241 	aesmc	q1, q1          @ AES block 1 - round 7
   3242 
   3243 	aese	q2, v27.16b
   3244 	aesmc	q2, q2          @ AES block 2 - round 9
   3245 
   3246 	aese	q3, v27.16b
   3247 	aesmc	q3, q3          @ AES block 3 - round 9
   3248 
   3249 	aese	q1, v26.16b
   3250 	aesmc	q1, q1          @ AES block 1 - round 8
   3251 	sub	r5, r5, #1      @ byte_len - 1
   3252 
   3253 	aese	q0, v26.16b
   3254 	aesmc	q0, q0          @ AES block 0 - round 8
   3255 	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   3256 
   3257 	aese	q3, v28.16b
   3258 	aesmc	q3, q3          @ AES block 3 - round 10
   3259 	add	r5, r5, r0
   3260 
   3261 	aese	q1, v27.16b
   3262 	aesmc	q1, q1          @ AES block 1 - round 9
   3263 	cmp	r0, r5                   @ check if we have <= 4 blocks
   3264 
   3265 	aese	q0, v27.16b
   3266 	aesmc	q0, q0          @ AES block 0 - round 9
   3267 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   3268 
   3269 	aese	q3, v29.16b                                     @ AES block 3 - round 11
   3270 
   3271 	aese	q2, v28.16b
   3272 	aesmc	q2, q2          @ AES block 2 - round 10
   3273 
   3274 	aese	q1, v28.16b
   3275 	aesmc	q1, q1          @ AES block 1 - round 10
   3276 
   3277 	aese	q0, v28.16b
   3278 	aesmc	q0, q0          @ AES block 0 - round 10
   3279 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   3280 
   3281 	aese	q2, v29.16b                                     @ AES block 2 - round 11
   3282 
   3283 	aese	q1, v29.16b                                     @ AES block 1 - round 11
   3284 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   3285 
   3286 	aese	q0, v29.16b                                     @ AES block 0 - round 11
   3287 	bge	.L192_dec_tail                                    @ handle tail
   3288 
   3289 	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
   3290 
   3291 	eor	q1, q5, q1                            @ AES block 1 - result
   3292 
   3293 	eor	q0, q4, q0                            @ AES block 0 - result
   3294 	rev	r9, r12                                 @ CTR block 4
   3295 	ld1	{q6, q7}, [r0], #32               @ AES block 2,3 - load ciphertext
   3296 
   3297 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
   3298 
   3299 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
   3300 
   3301 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
   3302 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   3303 	add	r12, r12, #1                            @ CTR block 4
   3304 
   3305 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
   3306 	rev64	q4, q4                                    @ GHASH block 0
   3307 
   3308 	fmov	d0, r10                               @ CTR block 4
   3309 	rev64	q5, q5                                    @ GHASH block 1
   3310 	cmp	r0, r5                   @ check if we have <= 8 blocks
   3311 
   3312 	eor	r19, r19, r13                   @ AES block 1 - round 12 low
   3313 #ifdef __ARMEB__
   3314 	rev	r19, r19
   3315 #endif
   3316 	fmov	v0.d[1], r9                               @ CTR block 4
   3317 	rev	r9, r12                                 @ CTR block 5
   3318 
   3319 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   3320 	fmov	d1, r10                               @ CTR block 5
   3321 	eor	r20, r20, r14                   @ AES block 1 - round 12 high
   3322 #ifdef __ARMEB__
   3323 	rev	r20, r20
   3324 #endif
   3325 	add	r12, r12, #1                            @ CTR block 5
   3326 	fmov	v1.d[1], r9                               @ CTR block 5
   3327 	eor	r6, r6, r13                   @ AES block 0 - round 12 low
   3328 #ifdef __ARMEB__
   3329 	rev	r6, r6
   3330 #endif
   3331 	rev	r9, r12                                 @ CTR block 6
   3332 	eor	r7, r7, r14                   @ AES block 0 - round 12 high
   3333 #ifdef __ARMEB__
   3334 	rev	r7, r7
   3335 #endif
   3336 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
   3337 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   3338 
   3339 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
   3340 
   3341 	add	r12, r12, #1                            @ CTR block 6
   3342 	eor	q2, q6, q2                            @ AES block 2 - result
   3343 	bge	.L192_dec_prepretail                              @ do prepretail
   3344 
   3345 .L192_dec_main_loop:@ main loop start
   3346 	aese	q1, v18.16b
   3347 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   3348 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   3349 
   3350 	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   3351 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   3352 
   3353 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   3354 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   3355 	rev64	q7, q7                                    @ GHASH block 4k+3
   3356 
   3357 	aese	q1, v19.16b
   3358 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   3359 	fmov	d2, r10                               @ CTR block 4k+6
   3360 
   3361 	aese	q0, v18.16b
   3362 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   3363 	eor	q4, q4, v11.16b                           @ PRE 1
   3364 
   3365 	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   3366 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   3367 
   3368 	aese	q1, v20.16b
   3369 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   3370 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   3371 
   3372 	aese	q0, v19.16b
   3373 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   3374 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   3375 
   3376 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   3377 	fmov	d3, r10                               @ CTR block 4k+7
   3378 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   3379 
   3380 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   3381 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   3382 	rev	r9, r12                                 @ CTR block 4k+7
   3383 
   3384 	aese	q2, v18.16b
   3385 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   3386 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   3387 
   3388 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   3389 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   3390 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   3391 
   3392 	aese	q1, v21.16b
   3393 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   3394 
   3395 	aese	q0, v20.16b
   3396 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   3397 	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
   3398 #ifdef __ARMEB__
   3399 	rev	r22, r22
   3400 #endif
   3401 	aese	q2, v19.16b
   3402 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   3403 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   3404 
   3405 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   3406 
   3407 	aese	q3, v18.16b
   3408 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   3409 	rev64	q6, q6                                    @ GHASH block 4k+2
   3410 
   3411 	aese	q2, v20.16b
   3412 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   3413 
   3414 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   3415 	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low
   3416 	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
   3417 #ifdef __ARMEB__
   3418 	rev	r21, r21
   3419 #endif
   3420 	aese	q1, v22.16b
   3421 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   3422 
   3423 	aese	q0, v21.16b
   3424 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   3425 
   3426 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   3427 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   3428 
   3429 	aese	q3, v19.16b
   3430 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   3431 	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high
   3432 
   3433 	aese	q0, v22.16b
   3434 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   3435 
   3436 	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   3437 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   3438 
   3439 	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   3440 
   3441 	aese	q0, v23.16b
   3442 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   3443 
   3444 	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high
   3445 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   3446 
   3447 	aese	q1, v23.16b
   3448 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   3449 
   3450 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   3451 
   3452 	aese	q3, v20.16b
   3453 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   3454 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   3455 
   3456 	aese	q1, v24.16b
   3457 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   3458 
   3459 	aese	q0, v24.16b
   3460 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   3461 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   3462 
   3463 	aese	q3, v21.16b
   3464 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   3465 
   3466 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   3467 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low
   3468 
   3469 	aese	q0, v25.16b
   3470 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   3471 
   3472 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   3473 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   3474 
   3475 	aese	q1, v25.16b
   3476 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   3477 
   3478 	aese	q0, v26.16b
   3479 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   3480 	movi	q8, #0xc2
   3481 
   3482 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   3483 
   3484 	aese	q1, v26.16b
   3485 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   3486 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   3487 
   3488 	aese	q2, v21.16b
   3489 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   3490 
   3491 	aese	q0, v27.16b
   3492 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   3493 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   3494 
   3495 	aese	q3, v22.16b
   3496 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   3497 
   3498 	aese	q2, v22.16b
   3499 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   3500 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   3501 
   3502 	aese	q0, v28.16b
   3503 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   3504 
   3505 	aese	q1, v27.16b
   3506 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   3507 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   3508 
   3509 	aese	q2, v23.16b
   3510 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   3511 
   3512 	aese	q3, v23.16b
   3513 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   3514 	shl	d8, d8, #56               @ mod_constant
   3515 
   3516 	aese	q1, v28.16b
   3517 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   3518 
   3519 	aese	q2, v24.16b
   3520 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   3521 	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext
   3522 
   3523 	aese	q3, v24.16b
   3524 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   3525 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   3526 
   3527 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   3528 	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
   3529 	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
   3530 #ifdef __ARMEB__
   3531 	rev	r23, r23
   3532 #endif
   3533 	aese	q2, v25.16b
   3534 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   3535 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   3536 
   3537 	aese	q0, v29.16b                                     @ AES block 4k+4 - round 11
   3538 	add	r12, r12, #1                            @ CTR block 4k+7
   3539 
   3540 	aese	q3, v25.16b
   3541 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   3542 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   3543 
   3544 	aese	q2, v26.16b
   3545 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   3546 	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext
   3547 
   3548 	aese	q1, v29.16b                                     @ AES block 4k+5 - round 11
   3549 	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
   3550 	rev	r9, r12                                 @ CTR block 4k+8
   3551 
   3552 	aese	q3, v26.16b
   3553 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   3554 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   3555 
   3556 	aese	q2, v27.16b
   3557 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   3558 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   3559 
   3560 	cmp	r0, r5                   @ LOOP CONTROL
   3561 
   3562 	eor	q0, q4, q0                            @ AES block 4k+4 - result
   3563 	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
   3564 #ifdef __ARMEB__
   3565 	rev	r24, r24
   3566 #endif
   3567 	eor	q1, q5, q1                            @ AES block 4k+5 - result
   3568 
   3569 	aese	q2, v28.16b
   3570 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   3571 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   3572 
   3573 	aese	q3, v27.16b
   3574 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   3575 
   3576 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   3577 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
   3578 
   3579 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   3580 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   3581 	rev64	q5, q5                                    @ GHASH block 4k+5
   3582 
   3583 	aese	q2, v29.16b                                     @ AES block 4k+6 - round 11
   3584 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   3585 
   3586 	aese	q3, v28.16b
   3587 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   3588 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
   3589 
   3590 	fmov	d0, r10                               @ CTR block 4k+8
   3591 	add	r12, r12, #1                            @ CTR block 4k+8
   3592 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   3593 
   3594 	eor	q2, q6, q2                            @ AES block 4k+6 - result
   3595 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   3596 	rev	r9, r12                                 @ CTR block 4k+9
   3597 
   3598 	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
   3599 #ifdef __ARMEB__
   3600 	rev	r6, r6
   3601 #endif
   3602 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   3603 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   3604 
   3605 	fmov	d1, r10                               @ CTR block 4k+9
   3606 	add	r12, r12, #1                            @ CTR block 4k+9
   3607 	eor	r19, r19, r13                   @ AES block 4k+5 - round 12 low
   3608 #ifdef __ARMEB__
   3609 	rev	r19, r19
   3610 #endif
   3611 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   3612 	rev	r9, r12                                 @ CTR block 4k+10
   3613 	eor	r20, r20, r14                   @ AES block 4k+5 - round 12 high
   3614 #ifdef __ARMEB__
   3615 	rev	r20, r20
   3616 #endif
   3617 	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
   3618 #ifdef __ARMEB__
   3619 	rev	r7, r7
   3620 #endif
   3621 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
   3622 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   3623 
   3624 	add	r12, r12, #1                            @ CTR block 4k+10
   3625 	rev64	q4, q4                                    @ GHASH block 4k+4
   3626 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   3627 
   3628 	aese	q3, v29.16b                                     @ AES block 4k+7 - round 11
   3629 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
   3630 	blt	.L192_dec_main_loop
   3631 
   3632 .L192_dec_prepretail:@ PREPRETAIL
   3633 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   3634 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   3635 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   3636 
   3637 	aese	q1, v18.16b
   3638 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   3639 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   3640 
   3641 	aese	q0, v18.16b
   3642 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   3643 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   3644 
   3645 	eor	q4, q4, v11.16b                           @ PRE 1
   3646 	fmov	d2, r10                               @ CTR block 4k+6
   3647 
   3648 	aese	q1, v19.16b
   3649 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   3650 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   3651 
   3652 	aese	q0, v19.16b
   3653 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   3654 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   3655 
   3656 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   3657 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   3658 	fmov	d3, r10                               @ CTR block 4k+7
   3659 
   3660 	aese	q1, v20.16b
   3661 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   3662 	rev64	q6, q6                                    @ GHASH block 4k+2
   3663 
   3664 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   3665 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   3666 	rev	r9, r12                                 @ CTR block 4k+7
   3667 
   3668 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   3669 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   3670 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   3671 
   3672 	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   3673 	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
   3674 #ifdef __ARMEB__
   3675 	rev	r24, r24
   3676 #endif
   3677 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   3678 
   3679 	aese	q0, v20.16b
   3680 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   3681 	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
   3682 #ifdef __ARMEB__
   3683 	rev	r21, r21
   3684 #endif
   3685 	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   3686 	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
   3687 #ifdef __ARMEB__
   3688 	rev	r22, r22
   3689 #endif
   3690 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   3691 
   3692 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   3693 	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
   3694 #ifdef __ARMEB__
   3695 	rev	r23, r23
   3696 #endif
   3697 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   3698 
   3699 	rev64	q7, q7                                    @ GHASH block 4k+3
   3700 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   3701 
   3702 	aese	q3, v18.16b
   3703 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   3704 	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high
   3705 
   3706 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   3707 	add	r12, r12, #1                            @ CTR block 4k+7
   3708 
   3709 	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   3710 	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low
   3711 
   3712 	aese	q2, v18.16b
   3713 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   3714 
   3715 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   3716 	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
   3717 
   3718 	aese	q3, v19.16b
   3719 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   3720 
   3721 	aese	q2, v19.16b
   3722 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   3723 	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high
   3724 
   3725 	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
   3726 
   3727 	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   3728 
   3729 	aese	q2, v20.16b
   3730 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   3731 	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
   3732 
   3733 	aese	q3, v20.16b
   3734 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   3735 	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
   3736 
   3737 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   3738 
   3739 	aese	q0, v21.16b
   3740 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   3741 	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
   3742 
   3743 	aese	q1, v21.16b
   3744 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   3745 
   3746 	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
   3747 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low
   3748 
   3749 	aese	q0, v22.16b
   3750 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   3751 
   3752 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   3753 	movi	q8, #0xc2
   3754 
   3755 	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
   3756 
   3757 	aese	q2, v21.16b
   3758 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   3759 
   3760 	shl	d8, d8, #56               @ mod_constant
   3761 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   3762 
   3763 	aese	q0, v23.16b
   3764 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   3765 	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
   3766 
   3767 	aese	q2, v22.16b
   3768 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   3769 
   3770 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
   3771 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   3772 
   3773 	aese	q0, v24.16b
   3774 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   3775 
   3776 	aese	q3, v21.16b
   3777 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   3778 	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
   3779 
   3780 	aese	q2, v23.16b
   3781 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   3782 
   3783 	aese	q0, v25.16b
   3784 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   3785 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   3786 
   3787 	aese	q3, v22.16b
   3788 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   3789 
   3790 	aese	q2, v24.16b
   3791 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   3792 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   3793 
   3794 	aese	q0, v26.16b
   3795 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   3796 
   3797 	aese	q3, v23.16b
   3798 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   3799 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   3800 
   3801 	aese	q1, v22.16b
   3802 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   3803 
   3804 	aese	q2, v25.16b
   3805 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   3806 
   3807 	aese	q0, v27.16b
   3808 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   3809 
   3810 	aese	q1, v23.16b
   3811 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   3812 
   3813 	aese	q3, v24.16b
   3814 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   3815 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   3816 
   3817 	aese	q0, v28.16b
   3818 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   3819 
   3820 	aese	q1, v24.16b
   3821 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   3822 
   3823 	aese	q3, v25.16b
   3824 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   3825 
   3826 	aese	q2, v26.16b
   3827 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   3828 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   3829 
   3830 	aese	q1, v25.16b
   3831 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   3832 
   3833 	aese	q3, v26.16b
   3834 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   3835 
   3836 	aese	q2, v27.16b
   3837 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   3838 
   3839 	aese	q1, v26.16b
   3840 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   3841 
   3842 	aese	q3, v27.16b
   3843 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   3844 
   3845 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   3846 
   3847 	aese	q1, v27.16b
   3848 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   3849 
   3850 	aese	q2, v28.16b
   3851 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   3852 
   3853 	aese	q3, v28.16b
   3854 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   3855 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   3856 
   3857 	aese	q1, v28.16b
   3858 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   3859 
   3860 	aese	q0, v29.16b
   3861 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   3862 
   3863 	aese	q2, v29.16b
   3864 
   3865 	aese	q1, v29.16b
   3866 
   3867 	aese	q3, v29.16b
   3868 
   3869 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   3870 .L192_dec_tail:@ TAIL
        @ Tail of the 192-bit decrypt kernel: handles the last few blocks.
        @ r5 = r4 - r0 is the number of input bytes still to process (r4 is
        @ presumably end_input_ptr, set up before this view - TODO confirm).
        @ The first remaining ciphertext block is loaded into q5 and XORed with
        @ q0; its two 64-bit halves are then XORed with r14/r13 (the existing
        @ comments call this "round 12 high/low"), leaving the result in r7:r6.
        @ q8 = v11 with its 64-bit halves swapped; the first GHASH block handled
        @ below XORs it in as the running partial tag.
        @ Dispatch on bytes left: >48 -> more_than_3, >32 -> more_than_2,
        @ >16 -> more_than_1, otherwise -> less_than_1.
   3871 
   3872 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
   3873 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   3874 
   3875 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   3876 
   3877 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   3878 
   3879 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   3880 
   3881 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   3882 
   3883 	cmp	r5, #48
   3884 
   3885 	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
   3886 #ifdef __ARMEB__
   3887 	rev	r7, r7
   3888 #endif
   3889 	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
   3890 #ifdef __ARMEB__
   3891 	rev	r6, r6
   3892 #endif
   3893 	bgt	.L192_dec_blocks_more_than_3
   3894 
        @ more_than_3 is skipped from here on. That section is the one that
        @ writes (rather than accumulates into) v11/q9/v10, the GHASH
        @ low/high/mid accumulators, so they are zeroed here instead.
   3895 	movi	v11.8b, #0
   3896 	movi	q9, #0
   3897 
        @ Shift the pending q1/q2 values down one slot so they line up with the
        @ registers the later sections XOR against (more_than_2 uses q2,
        @ more_than_1 uses q3).
        @ r12 is decremented once for each section skipped; less_than_1 stores
        @ it (byte-reversed on little-endian builds) at [r16, #12] as the
        @ updated counter.
   3898 	mov	q3, q2
   3899 	mov	q2, q1
   3900 	sub	r12, r12, #1
   3901 
   3902 	movi	v10.8b, #0
   3903 	cmp	r5, #32
   3904 	bgt	.L192_dec_blocks_more_than_2
   3905 
   3906 	mov	q3, q1
   3907 	cmp	r5, #16
   3908 	sub	r12, r12, #1
   3909 
   3910 	bgt	.L192_dec_blocks_more_than_1
   3911 
   3912 	sub	r12, r12, #1
   3913 	b	.L192_dec_blocks_less_than_1
   3914 .L192_dec_blocks_more_than_3:@ blocks left >  3
        @ Entered from the tail when more than 48 bytes remain.
        @ Data path: the already finished block in r6:r7 is stored to [r2]
        @ (post-incremented by 16), the next ciphertext block is loaded into q5
        @ and XORed with q1, and the two halves are XORed with r13/r14.
        @ GHASH path: the previous ciphertext block is byte-reversed per 64-bit
        @ lane into q4, XORed with the partial tag in q8, and multiplied by v15
        @ (presumably H^4, cf. the "load h4l | h4h" loads elsewhere in this
        @ file - TODO confirm for this kernel).
        @ The low/high/mid products are written directly to v11/v9/v10, which
        @ initialises the accumulators the following sections XOR into.
        @ q8 is then cleared so the partial tag is only fed in once.
        @ Falls through into more_than_2.
   3915 	rev64	q4, q5                                    @ GHASH final-3 block
   3916 	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext
   3917 
   3918 	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result
   3919 
   3920 	eor	q4, q4, q8                           @ feed in partial tag
   3921 
   3922 	eor	q0, q5, q1                            @ AES final-2 block - result
   3923 
   3924 	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
   3925 	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
   3926 	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
   3927 
   3928 	mov	r7, v0.d[1]                            @ AES final-2 block - mov high
   3929 
        @ The mid term multiplies (high half XOR low half) of the block, built
        @ in v22, by the upper 64 bits of v17, copied into d10.
   3930 	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
   3931 	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
   3932 
   3933 	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
   3934 
   3935 	eor	r6, r6, r13                   @ AES final-2 block - round 12 low
   3936 #ifdef __ARMEB__
   3937 	rev	r6, r6
   3938 #endif
   3939 	movi	q8, #0                                        @ suppress further partial tag feed in
   3940 
   3941 	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
   3942 	eor	r7, r7, r14                   @ AES final-2 block - round 12 high
   3943 #ifdef __ARMEB__
   3944 	rev	r7, r7
   3945 #endif
   3946 .L192_dec_blocks_more_than_2:@ blocks left >  2
        @ Reached by fall-through from more_than_3 or directly from the tail
        @ when more than 32 bytes remain.
        @ Data path: store the finished block in r6:r7 to [r2] (post-increment
        @ 16), load the next ciphertext block into q5, XOR it with q2 and then
        @ with r13/r14.
        @ GHASH path: the previous ciphertext block (byte-reversed per 64-bit
        @ lane, XORed with q8) is multiplied by v14 (presumably H^3 - TODO
        @ confirm); the low/high/mid products are XOR-accumulated into
        @ v11/q9/v10. The mid product uses the low 64 bits of v17.
        @ q8 is zero after this section, so a partial tag fed in here is not
        @ fed in again by later sections.
        @ Falls through into more_than_1.
   3947 
   3948 	rev64	q4, q5                                    @ GHASH final-2 block
   3949 	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext
   3950 
   3951 	eor	q4, q4, q8                           @ feed in partial tag
   3952 
   3953 	movi	q8, #0                                        @ suppress further partial tag feed in
   3954 
   3955 	eor	q0, q5, q2                            @ AES final-1 block - result
   3956 
   3957 	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
   3958 
   3959 	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
   3960 
   3961 	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result
   3962 
   3963 	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
   3964 	mov	r7, v0.d[1]                            @ AES final-1 block - mov high
   3965 
   3966 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
   3967 	mov	r6, v0.d[0]                            @ AES final-1 block - mov low
   3968 
   3969 	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
   3970 
   3971 	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
   3972 
   3973 	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
   3974 	eor	r7, r7, r14                   @ AES final-1 block - round 12 high
   3975 #ifdef __ARMEB__
   3976 	rev	r7, r7
   3977 #endif
   3978 	eor	r6, r6, r13                   @ AES final-1 block - round 12 low
   3979 #ifdef __ARMEB__
   3980 	rev	r6, r6
   3981 #endif
   3982 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
   3983 .L192_dec_blocks_more_than_1:@ blocks left >  1
        @ Reached by fall-through from more_than_2 or directly from the tail
        @ when more than 16 bytes remain.
        @ Data path: store the finished block in r6:r7 to [r2] (post-increment
        @ 16), load the last ciphertext block into q5, XOR it with q3 and then
        @ with r13/r14. The result stays in r6:r7 for less_than_1 to mask and
        @ store.
        @ GHASH path: the previous ciphertext block (byte-reversed per 64-bit
        @ lane, XORed with q8) is multiplied by v13 (presumably H^2 - TODO
        @ confirm); the low/high/mid products are XOR-accumulated into
        @ v11/q9/v10. For the mid product the folded half is duplicated into
        @ the upper lane of v22 so that pmull2 can multiply it by the upper
        @ 64 bits of v16.
        @ Falls through into less_than_1.
   3984 
   3985 	rev64	q4, q5                                    @ GHASH final-1 block
   3986 
   3987 	eor	q4, q4, q8                           @ feed in partial tag
   3988 	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext
   3989 
   3990 	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
   3991 
   3992 	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
   3993 
   3994 	eor	q0, q5, q3                            @ AES final block - result
   3995 	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result
   3996 
   3997 	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
   3998 
   3999 	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
   4000 
   4001 	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
   4002 	mov	r7, v0.d[1]                            @ AES final block - mov high
   4003 
   4004 	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
   4005 	mov	r6, v0.d[0]                            @ AES final block - mov low
   4006 
   4007 	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
   4008 
   4009 	movi	q8, #0                                        @ suppress further partial tag feed in
   4010 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
   4011 	eor	r7, r7, r14                   @ AES final block - round 12 high
   4012 #ifdef __ARMEB__
   4013 	rev	r7, r7
   4014 #endif
   4015 	eor	r6, r6, r13                   @ AES final block - round 12 low
   4016 #ifdef __ARMEB__
   4017 	rev	r6, r6
   4018 #endif
   4019 	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
   4020 .L192_dec_blocks_less_than_1:@ blocks left <= 1
        @ Final (possibly partial) block, GHASH reduction, tag store and
        @ function epilogue.
        @ On entry r6:r7 hold the result for the last block, q5 the last
        @ ciphertext block, q8 the partial tag (or zero) and v11/q9/v10 the
        @ GHASH low/high/mid accumulators.
        @
        @ Step 1 - build a 128-bit mask for the valid part of the last block.
        @ r13/r14 are reused as scratch here (both are overwritten with
        @ all-ones). r1 becomes (128 - (r1 mod 128)) mod 128, and r14 becomes
        @ all-ones shifted right by r1.
        @ NOTE(review): for r1 >= 64 this relies on the register shift amount
        @ being taken modulo 64 - confirm against the ISA reference.
        @   r1 <  64: low mask r9 = all-ones, high mask r10 = r14
        @   r1 >= 64: low mask r9 = r14,      high mask r10 = 0
   4021 
   4022 	mvn	r13, xzr                                      @ rk12_l = 0xffffffffffffffff
   4023 	ldp	r4, r5, [r2]  @ load existing bytes we need to not overwrite
   4024 	and	r1, r1, #127                    @ bit_length %= 128
   4025 
   4026 	sub	r1, r1, #128                    @ bit_length -= 128
   4027 
   4028 	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
   4029 
   4030 	and	r1, r1, #127                    @ bit_length %= 128
   4031 	mvn	r14, xzr                                      @ rk12_h = 0xffffffffffffffff
   4032 
   4033 	lsr	r14, r14, r1                     @ rk12_h is mask for top 64b of last block
   4034 	cmp	r1, #64
   4035 
   4036 	csel	r9, r13, r14, lt
   4037 	csel	r10, r14, xzr, lt
   4038 
        @ Step 2 - apply the mask. v0 receives the mask (r9 low, r10 high) and
        @ is ANDed into the ciphertext block q5 before it is hashed. r6/r7 keep
        @ only the masked-in result bits and are merged with the bytes already
        @ at [r2] (r4/r5), so destination bytes beyond the message are
        @ preserved when the block is stored further down.
   4039 	fmov	d0, r9                                   @ ctr0b is mask for last block
   4040 	and	r6, r6, r9
   4041 	bic	r4, r4, r9           @ mask out low existing bytes
   4042 
   4043 	orr	r6, r6, r4
   4044 	mov	v0.d[1], r10
        @ r9 is free again: use it to write r12 back to offset 12 of the
        @ buffer at r16, byte-reversed unless built for big-endian.
   4045 #ifndef __ARMEB__
   4046 	rev	r9, r12
   4047 #else
   4048 	mov	r9, r12
   4049 #endif
   4050 
   4051 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   4052 	str	r9, [r16, #12]                          @ store the updated counter
   4053 
        @ Step 3 - GHASH the masked final block with v12 (presumably H^1 -
        @ TODO confirm), XOR-accumulating low/high/mid into v11/q9/v10. The
        @ mid product uses the low 64 bits of v16.
   4054 	rev64	q4, q5                                    @ GHASH final block
   4055 
   4056 	eor	q4, q4, q8                           @ feed in partial tag
   4057 	bic	r5, r5, r10 @ mask out high existing bytes
   4058 
   4059 	and	r7, r7, r10
   4060 
   4061 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   4062 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   4063 
   4064 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   4065 
   4066 	eor	q8, q8, q4                          @ GHASH final block - mid
   4067 
   4068 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   4069 
   4070 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   4071 
   4072 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   4073 
   4074 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
        @ Step 4 - reduction. q8 is loaded with 0xc2 and shifted left by 56 to
        @ form the constant the existing comments call mod_constant. The high
        @ accumulator is folded into mid, then mid into low, each with one
        @ pmull by that constant plus a half-swapped (ext #8) copy.
   4075 	movi	q8, #0xc2
   4076 
   4077 	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
   4078 
   4079 	shl	d8, d8, #56               @ mod_constant
   4080 
   4081 	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
   4082 
   4083 	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
        @ Store the merged final block; no post-increment on r2 this time.
   4084 	orr	r7, r7, r5
   4085 	stp	r6, r7, [r2]
   4086 
   4087 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   4088 
   4089 	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
   4090 
   4091 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   4092 
   4093 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   4094 
   4095 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   4096 
   4097 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   4098 
   4099 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
        @ Step 5 - swap the halves and byte-reverse each 64-bit lane of v11,
        @ then write the 16-byte result to [r3].
        @ The return value is r15 (presumably byte_len saved at entry, as the
        @ enc kernels in this file do - TODO confirm for this kernel).
   4100 	ext	v11.16b, v11.16b, v11.16b, #8
   4101 	rev64	v11.16b, v11.16b
   4102 	mov	r0, r15
   4103 	st1	{ v11.16b }, [r3]
   4104 
        @ Epilogue: reload r19-r24 and d8-d15 from the 112-byte stack frame and
        @ release it with the final post-indexed ldp (the matching stores are
        @ presumably in the prologue, outside this view).
   4105 	ldp	r21, r22, [sp, #16]
   4106 	ldp	r23, r24, [sp, #32]
   4107 	ldp	d8, d9, [sp, #48]
   4108 	ldp	d10, d11, [sp, #64]
   4109 	ldp	d12, d13, [sp, #80]
   4110 	ldp	d14, d15, [sp, #96]
   4111 	ldp	r19, r20, [sp], #112
   4112 	RET
   4113 
   4114 .L192_dec_ret:
        @ Early-exit path: returns 0 in r0 and restores nothing from the stack.
        @ Presumably the target of a zero-length check at function entry
        @ (outside this view; aes_gcm_enc_256_kernel uses
        @ "cbz r1, .L256_enc_ret" for the same purpose) - TODO confirm.
   4115 	mov	r0, #0x0
   4116 	RET
   4117 .size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
   4118 .globl	aes_gcm_enc_256_kernel
   4119 .type	aes_gcm_enc_256_kernel,%function
   4120 .align	4
   4121 aes_gcm_enc_256_kernel:
   4122 	AARCH64_VALID_CALL_TARGET
   4123 	cbz	r1, .L256_enc_ret
   4124 	stp	r19, r20, [sp, #-112]!
   4125 	mov	r16, r4
   4126 	mov	r8, r5
   4127 	stp	r21, r22, [sp, #16]
   4128 	stp	r23, r24, [sp, #32]
   4129 	stp	d8, d9, [sp, #48]
   4130 	stp	d10, d11, [sp, #64]
   4131 	stp	d12, d13, [sp, #80]
   4132 	stp	d14, d15, [sp, #96]
   4133 
   4134 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   4135 	lsr	r5, r1, #3              @ byte_len
   4136 	mov	r15, r5
   4137 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   4138 #ifdef __ARMEB__
   4139 	rev	r10, r10
   4140 	rev	r11, r11
   4141 #endif
   4142 	ldp	r13, r14, [r8, #224]                     @ load rk14
   4143 #ifdef __ARMEB__
   4144 	ror	r13, r13, #32
   4145 	ror	r14, r14, #32
   4146 #endif
   4147 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   4148 	sub	r5, r5, #1      @ byte_len - 1
   4149 
   4150 	ld1	{v18.4s}, [r8], #16                               @ load rk0
   4151 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   4152 
   4153 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   4154 	add	r5, r5, r0
   4155 
   4156 	lsr	r12, r11, #32
   4157 	fmov	d2, r10                               @ CTR block 2
   4158 	orr	r11, r11, r11
   4159 
   4160 	rev	r12, r12                                @ rev_ctr32
   4161 	cmp	r0, r5                   @ check if we have <= 4 blocks
   4162 	fmov	d1, r10                               @ CTR block 1
   4163 
   4164 	aese	q0, v18.16b
   4165 	aesmc	q0, q0          @ AES block 0 - round 0
   4166 	add	r12, r12, #1                            @ increment rev_ctr32
   4167 
   4168 	rev	r9, r12                                 @ CTR block 1
   4169 	fmov	d3, r10                               @ CTR block 3
   4170 
   4171 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   4172 	add	r12, r12, #1                            @ CTR block 1
   4173 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   4174 
   4175 	fmov	v1.d[1], r9                               @ CTR block 1
   4176 	rev	r9, r12                                 @ CTR block 2
   4177 	add	r12, r12, #1                            @ CTR block 2
   4178 
   4179 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   4180 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   4181 
   4182 	fmov	v2.d[1], r9                               @ CTR block 2
   4183 	rev	r9, r12                                 @ CTR block 3
   4184 
   4185 	aese	q0, v19.16b
   4186 	aesmc	q0, q0          @ AES block 0 - round 1
   4187 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   4188 
   4189 	fmov	v3.d[1], r9                               @ CTR block 3
   4190 
   4191 	aese	q1, v18.16b
   4192 	aesmc	q1, q1          @ AES block 1 - round 0
   4193 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   4194 
   4195 	aese	q0, v20.16b
   4196 	aesmc	q0, q0          @ AES block 0 - round 2
   4197 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   4198 
   4199 	aese	q2, v18.16b
   4200 	aesmc	q2, q2          @ AES block 2 - round 0
   4201 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   4202 
   4203 	aese	q1, v19.16b
   4204 	aesmc	q1, q1          @ AES block 1 - round 1
   4205 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   4206 #ifndef __ARMEB__
   4207 	ext	v14.16b, v14.16b, v14.16b, #8
   4208 #endif
   4209 	aese	q3, v18.16b
   4210 	aesmc	q3, q3          @ AES block 3 - round 0
   4211 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   4212 
   4213 	aese	q2, v19.16b
   4214 	aesmc	q2, q2          @ AES block 2 - round 1
   4215 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   4216 
   4217 	aese	q1, v20.16b
   4218 	aesmc	q1, q1          @ AES block 1 - round 2
   4219 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   4220 #ifndef __ARMEB__
   4221 	ext	v13.16b, v13.16b, v13.16b, #8
   4222 #endif
   4223 	aese	q3, v19.16b
   4224 	aesmc	q3, q3          @ AES block 3 - round 1
   4225 	ld1	{v27.4s}, [r8], #16                               @ load rk9
   4226 
   4227 	aese	q2, v20.16b
   4228 	aesmc	q2, q2          @ AES block 2 - round 2
   4229 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   4230 #ifndef __ARMEB__
   4231 	ext	v15.16b, v15.16b, v15.16b, #8
   4232 #endif
   4233 	aese	q1, v21.16b
   4234 	aesmc	q1, q1          @ AES block 1 - round 3
   4235 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   4236 
   4237 	aese	q3, v20.16b
   4238 	aesmc	q3, q3          @ AES block 3 - round 2
   4239 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   4240 
   4241 	aese	q2, v21.16b
   4242 	aesmc	q2, q2          @ AES block 2 - round 3
   4243 	add	r12, r12, #1                            @ CTR block 3
   4244 
   4245 	aese	q0, v21.16b
   4246 	aesmc	q0, q0          @ AES block 0 - round 3
   4247 
   4248 	aese	q3, v21.16b
   4249 	aesmc	q3, q3          @ AES block 3 - round 3
   4250 	ld1	{ v11.16b}, [r3]
   4251 	ext	v11.16b, v11.16b, v11.16b, #8
   4252 	rev64	v11.16b, v11.16b
   4253 
   4254 	aese	q2, v22.16b
   4255 	aesmc	q2, q2          @ AES block 2 - round 4
   4256 
   4257 	aese	q0, v22.16b
   4258 	aesmc	q0, q0          @ AES block 0 - round 4
   4259 
   4260 	aese	q1, v22.16b
   4261 	aesmc	q1, q1          @ AES block 1 - round 4
   4262 
   4263 	aese	q3, v22.16b
   4264 	aesmc	q3, q3          @ AES block 3 - round 4
   4265 
   4266 	aese	q0, v23.16b
   4267 	aesmc	q0, q0          @ AES block 0 - round 5
   4268 
   4269 	aese	q1, v23.16b
   4270 	aesmc	q1, q1          @ AES block 1 - round 5
   4271 
   4272 	aese	q3, v23.16b
   4273 	aesmc	q3, q3          @ AES block 3 - round 5
   4274 
   4275 	aese	q2, v23.16b
   4276 	aesmc	q2, q2          @ AES block 2 - round 5
   4277 
   4278 	aese	q1, v24.16b
   4279 	aesmc	q1, q1          @ AES block 1 - round 6
   4280 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   4281 
   4282 	aese	q3, v24.16b
   4283 	aesmc	q3, q3          @ AES block 3 - round 6
   4284 	ld1	{v30.4s}, [r8], #16                              @ load rk12
   4285 
   4286 	aese	q0, v24.16b
   4287 	aesmc	q0, q0          @ AES block 0 - round 6
   4288 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   4289 #ifndef __ARMEB__
   4290 	ext	v12.16b, v12.16b, v12.16b, #8
   4291 #endif
   4292 	aese	q2, v24.16b
   4293 	aesmc	q2, q2          @ AES block 2 - round 6
   4294 	ld1	{v31.4s}, [r8], #16                              @ load rk13
   4295 
   4296 	aese	q1, v25.16b
   4297 	aesmc	q1, q1          @ AES block 1 - round 7
   4298 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   4299 
   4300 	aese	q0, v25.16b
   4301 	aesmc	q0, q0          @ AES block 0 - round 7
   4302 
   4303 	aese	q2, v25.16b
   4304 	aesmc	q2, q2          @ AES block 2 - round 7
   4305 
   4306 	aese	q3, v25.16b
   4307 	aesmc	q3, q3          @ AES block 3 - round 7
   4308 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   4309 
   4310 	aese	q1, v26.16b
   4311 	aesmc	q1, q1          @ AES block 1 - round 8
   4312 
   4313 	aese	q2, v26.16b
   4314 	aesmc	q2, q2          @ AES block 2 - round 8
   4315 
   4316 	aese	q3, v26.16b
   4317 	aesmc	q3, q3          @ AES block 3 - round 8
   4318 
   4319 	aese	q1, v27.16b
   4320 	aesmc	q1, q1          @ AES block 1 - round 9
   4321 
   4322 	aese	q2, v27.16b
   4323 	aesmc	q2, q2          @ AES block 2 - round 9
   4324 
   4325 	aese	q0, v26.16b
   4326 	aesmc	q0, q0          @ AES block 0 - round 8
   4327 
   4328 	aese	q1, v28.16b
   4329 	aesmc	q1, q1          @ AES block 1 - round 10
   4330 
   4331 	aese	q3, v27.16b
   4332 	aesmc	q3, q3          @ AES block 3 - round 9
   4333 
   4334 	aese	q0, v27.16b
   4335 	aesmc	q0, q0          @ AES block 0 - round 9
   4336 
   4337 	aese	q2, v28.16b
   4338 	aesmc	q2, q2          @ AES block 2 - round 10
   4339 
   4340 	aese	q3, v28.16b
   4341 	aesmc	q3, q3          @ AES block 3 - round 10
   4342 
   4343 	aese	q1, v29.16b
   4344 	aesmc	q1, q1          @ AES block 1 - round 11
   4345 
   4346 	aese	q2, v29.16b
   4347 	aesmc	q2, q2          @ AES block 2 - round 11
   4348 
   4349 	aese	q0, v28.16b
   4350 	aesmc	q0, q0          @ AES block 0 - round 10
   4351 
   4352 	aese	q1, v30.16b
   4353 	aesmc	q1, q1          @ AES block 1 - round 12
   4354 
   4355 	aese	q2, v30.16b
   4356 	aesmc	q2, q2          @ AES block 2 - round 12
   4357 
   4358 	aese	q0, v29.16b
   4359 	aesmc	q0, q0          @ AES block 0 - round 11
   4360 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   4361 
   4362 	aese	q3, v29.16b
   4363 	aesmc	q3, q3          @ AES block 3 - round 11
   4364 
   4365 	aese	q2, v31.16b                                     @ AES block 2 - round 13
   4366 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   4367 
   4368 	aese	q0, v30.16b
   4369 	aesmc	q0, q0          @ AES block 0 - round 12
   4370 
   4371 	aese	q3, v30.16b
   4372 	aesmc	q3, q3          @ AES block 3 - round 12
   4373 
   4374 	aese	q1, v31.16b                                     @ AES block 1 - round 13
   4375 
   4376 	aese	q0, v31.16b                                     @ AES block 0 - round 13
   4377 
   4378 	aese	q3, v31.16b                                     @ AES block 3 - round 13
   4379 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   4380 	bge	.L256_enc_tail                                    @ handle tail
   4381 
   4382 	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
   4383 #ifdef __ARMEB__
   4384 	rev	r19, r19
   4385 	rev	r20, r20
   4386 #endif
   4387 	rev	r9, r12                                 @ CTR block 4
   4388 	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
   4389 #ifdef __ARMEB__
   4390 	rev	r6, r6
   4391 	rev	r7, r7
   4392 #endif
   4393 	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
   4394 #ifdef __ARMEB__
   4395 	rev	r23, r23
   4396 	rev	r24, r24
   4397 #endif
   4398 	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
   4399 #ifdef __ARMEB__
   4400 	rev	r21, r21
   4401 	rev	r22, r22
   4402 #endif
   4403 	add	r0, r0, #64                       @ AES input_ptr update
   4404 
   4405 	eor	r19, r19, r13                     @ AES block 1 - round 14 low
   4406 	eor	r20, r20, r14                     @ AES block 1 - round 14 high
   4407 
   4408 	fmov	d5, r19                               @ AES block 1 - mov low
   4409 	eor	r6, r6, r13                     @ AES block 0 - round 14 low
   4410 
   4411 	eor	r7, r7, r14                     @ AES block 0 - round 14 high
   4412 	eor	r24, r24, r14                     @ AES block 3 - round 14 high
   4413 	fmov	d4, r6                               @ AES block 0 - mov low
   4414 
   4415 	cmp	r0, r5                   @ check if we have <= 8 blocks
   4416 	fmov	v4.d[1], r7                           @ AES block 0 - mov high
   4417 	eor	r23, r23, r13                     @ AES block 3 - round 14 low
   4418 
   4419 	eor	r21, r21, r13                     @ AES block 2 - round 14 low
   4420 	fmov	v5.d[1], r20                           @ AES block 1 - mov high
   4421 
   4422 	fmov	d6, r21                               @ AES block 2 - mov low
   4423 	add	r12, r12, #1                            @ CTR block 4
   4424 
   4425 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   4426 	fmov	d7, r23                               @ AES block 3 - mov low
   4427 	eor	r22, r22, r14                     @ AES block 2 - round 14 high
   4428 
   4429 	fmov	v6.d[1], r22                           @ AES block 2 - mov high
   4430 
   4431 	eor	q4, q4, q0                          @ AES block 0 - result
   4432 	fmov	d0, r10                               @ CTR block 4
   4433 
   4434 	fmov	v0.d[1], r9                               @ CTR block 4
   4435 	rev	r9, r12                                 @ CTR block 5
   4436 	add	r12, r12, #1                            @ CTR block 5
   4437 
   4438 	eor	q5, q5, q1                          @ AES block 1 - result
   4439 	fmov	d1, r10                               @ CTR block 5
   4440 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   4441 
   4442 	fmov	v1.d[1], r9                               @ CTR block 5
   4443 	rev	r9, r12                                 @ CTR block 6
   4444 	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
   4445 
   4446 	fmov	v7.d[1], r24                           @ AES block 3 - mov high
   4447 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   4448 	eor	q6, q6, q2                          @ AES block 2 - result
   4449 
   4450 	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
   4451 
   4452 	add	r12, r12, #1                            @ CTR block 6
   4453 	fmov	d2, r10                               @ CTR block 6
   4454 
   4455 	fmov	v2.d[1], r9                               @ CTR block 6
   4456 	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
   4457 	rev	r9, r12                                 @ CTR block 7
   4458 
   4459 	orr	r9, r11, r9, lsl #32            @ CTR block 7
   4460 
   4461 	eor	q7, q7, q3                          @ AES block 3 - result
   4462 	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
   4463 	bge	.L256_enc_prepretail                               @ do prepretail
   4464 
   4465 .L256_enc_main_loop:@ main loop start
   4466 	aese	q0, v18.16b
   4467 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   4468 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
   4469 
   4470 	aese	q1, v18.16b
   4471 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   4472 	fmov	d3, r10                               @ CTR block 4k+3
   4473 
   4474 	aese	q2, v18.16b
   4475 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   4476 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   4477 
   4478 	aese	q0, v19.16b
   4479 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   4480 	fmov	v3.d[1], r9                               @ CTR block 4k+3
   4481 
   4482 	aese	q1, v19.16b
   4483 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   4484 	ldp	r23, r24, [r0, #48]           @ AES block 4k+7 - load plaintext
   4485 #ifdef __ARMEB__
   4486 	rev	r23, r23
   4487 	rev	r24, r24
   4488 #endif
   4489 	aese	q2, v19.16b
   4490 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   4491 	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
   4492 #ifdef __ARMEB__
   4493 	rev	r21, r21
   4494 	rev	r22, r22
   4495 #endif
   4496 	aese	q0, v20.16b
   4497 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   4498 	eor	q4, q4, v11.16b                           @ PRE 1
   4499 
   4500 	aese	q1, v20.16b
   4501 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   4502 
   4503 	aese	q3, v18.16b
   4504 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   4505 	eor	r23, r23, r13                     @ AES block 4k+7 - round 14 low
   4506 
   4507 	aese	q0, v21.16b
   4508 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   4509 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   4510 
   4511 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   4512 	eor	r22, r22, r14                     @ AES block 4k+6 - round 14 high
   4513 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   4514 
   4515 	aese	q3, v19.16b
   4516 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   4517 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
   4518 
   4519 	aese	q0, v22.16b
   4520 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   4521 
   4522 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   4523 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   4524 
   4525 	aese	q2, v20.16b
   4526 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   4527 
   4528 	aese	q0, v23.16b
   4529 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   4530 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4531 
   4532 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   4533 
   4534 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   4535 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
   4536 
   4537 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   4538 
   4539 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   4540 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   4541 
   4542 	aese	q1, v21.16b
   4543 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   4544 
   4545 	aese	q3, v20.16b
   4546 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   4547 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   4548 
   4549 	aese	q2, v21.16b
   4550 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   4551 
   4552 	aese	q1, v22.16b
   4553 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   4554 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   4555 
   4556 	aese	q3, v21.16b
   4557 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   4558 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   4559 
   4560 	aese	q2, v22.16b
   4561 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   4562 
   4563 	aese	q0, v24.16b
   4564 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   4565 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   4566 
   4567 	aese	q3, v22.16b
   4568 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   4569 
   4570 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   4571 
   4572 	aese	q0, v25.16b
   4573 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   4574 
   4575 	aese	q3, v23.16b
   4576 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   4577 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   4578 
   4579 	aese	q1, v23.16b
   4580 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   4581 
   4582 	aese	q0, v26.16b
   4583 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   4584 
   4585 	aese	q2, v23.16b
   4586 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   4587 
   4588 	aese	q1, v24.16b
   4589 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   4590 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   4591 
   4592 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   4593 
   4594 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   4595 
   4596 	aese	q1, v25.16b
   4597 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   4598 
   4599 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   4600 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   4601 
   4602 	aese	q3, v24.16b
   4603 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   4604 	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
   4605 #ifdef __ARMEB__
   4606 	rev	r19, r19
   4607 	rev	r20, r20
   4608 #endif
   4609 	aese	q1, v26.16b
   4610 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   4611 	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
   4612 
   4613 	aese	q2, v24.16b
   4614 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   4615 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   4616 
   4617 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   4618 
   4619 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   4620 	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
   4621 
   4622 	aese	q2, v25.16b
   4623 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   4624 	eor	r19, r19, r13                     @ AES block 4k+5 - round 14 low
   4625 
   4626 	aese	q1, v27.16b
   4627 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   4628 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   4629 
   4630 	aese	q3, v25.16b
   4631 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   4632 	eor	r21, r21, r13                     @ AES block 4k+6 - round 14 low
   4633 
   4634 	aese	q0, v27.16b
   4635 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   4636 	movi	q8, #0xc2
   4637 
   4638 	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
   4639 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   4640 	fmov	d5, r19                               @ AES block 4k+5 - mov low
   4641 
   4642 	aese	q2, v26.16b
   4643 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   4644 	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
   4645 #ifdef __ARMEB__
   4646 	rev	r6, r6
   4647 	rev	r7, r7
   4648 #endif
   4649 	aese	q0, v28.16b
   4650 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   4651 	shl	d8, d8, #56               @ mod_constant
   4652 
   4653 	aese	q3, v26.16b
   4654 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   4655 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   4656 
   4657 	aese	q2, v27.16b
   4658 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   4659 
   4660 	aese	q1, v28.16b
   4661 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   4662 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
   4663 
   4664 	aese	q3, v27.16b
   4665 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   4666 	add	r12, r12, #1                            @ CTR block 4k+3
   4667 
   4668 	aese	q0, v29.16b
   4669 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   4670 	eor	q4, v11.16b, q9                         @ MODULO - karatsuba tidy up
   4671 
   4672 	aese	q1, v29.16b
   4673 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   4674 	add	r0, r0, #64                       @ AES input_ptr update
   4675 
   4676 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   4677 	rev	r9, r12                                 @ CTR block 4k+8
   4678 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   4679 
   4680 	aese	q2, v28.16b
   4681 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   4682 	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
   4683 
   4684 	aese	q1, v30.16b
   4685 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   4686 	eor	v10.16b, v10.16b, q4                         @ MODULO - karatsuba tidy up
   4687 
   4688 	aese	q3, v28.16b
   4689 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   4690 	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
   4691 
   4692 	fmov	d4, r6                               @ AES block 4k+4 - mov low
   4693 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   4694 	eor	q7, q9, q7                   @ MODULO - fold into mid
   4695 
   4696 	aese	q0, v30.16b
   4697 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   4698 	eor	r20, r20, r14                     @ AES block 4k+5 - round 14 high
   4699 
   4700 	aese	q2, v29.16b
   4701 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   4702 	eor	r24, r24, r14                     @ AES block 4k+7 - round 14 high
   4703 
   4704 	aese	q3, v29.16b
   4705 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   4706 	add	r12, r12, #1                            @ CTR block 4k+8
   4707 
   4708 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   4709 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
   4710 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   4711 
   4712 	aese	q2, v30.16b
   4713 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   4714 	fmov	d7, r23                               @ AES block 4k+7 - mov low
   4715 
   4716 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   4717 	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
   4718 
   4719 	fmov	d6, r21                               @ AES block 4k+6 - mov low
   4720 	cmp	r0, r5                   @ .LOOP CONTROL
   4721 
   4722 	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
   4723 
   4724 	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
   4725 	eor	q4, q4, q0                          @ AES block 4k+4 - result
   4726 	fmov	d0, r10                               @ CTR block 4k+8
   4727 
   4728 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   4729 	rev	r9, r12                                 @ CTR block 4k+9
   4730 	add	r12, r12, #1                            @ CTR block 4k+9
   4731 
   4732 	eor	q5, q5, q1                          @ AES block 4k+5 - result
   4733 	fmov	d1, r10                               @ CTR block 4k+9
   4734 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   4735 
   4736 	aese	q3, v30.16b
   4737 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   4738 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   4739 
   4740 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   4741 	rev	r9, r12                                 @ CTR block 4k+10
   4742 	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
   4743 
   4744 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   4745 	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
   4746 	fmov	v7.d[1], r24                           @ AES block 4k+7 - mov high
   4747 
   4748 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   4749 	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
   4750 	add	r12, r12, #1                            @ CTR block 4k+10
   4751 
   4752 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   4753 	eor	q6, q6, q2                          @ AES block 4k+6 - result
   4754 	fmov	d2, r10                               @ CTR block 4k+10
   4755 
   4756 	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
   4757 	fmov	v2.d[1], r9                               @ CTR block 4k+10
   4758 	rev	r9, r12                                 @ CTR block 4k+11
   4759 
   4760 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   4761 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
   4762 
   4763 	eor	q7, q7, q3                          @ AES block 4k+7 - result
   4764 	st1	{ q7}, [r2], #16                     @ AES block 4k+7 - store result
   4765 	blt	.L256_enc_main_loop
   4766 
   4767 .L256_enc_prepretail:@ PREPRETAIL: run AES rounds 0-13 on counter blocks q0-q3 while GHASHing the blocks held in q4-q7 (no loads or stores here)
   4768 	aese	q1, v18.16b
   4769 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   4770 	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
   4771 
   4772 	aese	q2, v18.16b
   4773 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   4774 	fmov	d3, r10                               @ CTR block 4k+3
   4775 
   4776 	aese	q0, v18.16b
   4777 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   4778 	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
   4779 
   4780 	fmov	v3.d[1], r9                               @ CTR block 4k+3
   4781 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   4782 
   4783 	aese	q2, v19.16b
   4784 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   4785 
   4786 	aese	q0, v19.16b
   4787 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   4788 
   4789 	eor	q4, q4, v11.16b                           @ PRE 1
   4790 	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
   4791 
   4792 	aese	q2, v20.16b
   4793 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   4794 
   4795 	aese	q3, v18.16b
   4796 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   4797 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   4798 
   4799 	aese	q1, v19.16b
   4800 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   4801 
   4802 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   4803 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   4804 
   4805 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   4806 
   4807 	aese	q2, v21.16b
   4808 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   4809 
   4810 	aese	q1, v20.16b
   4811 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   4812 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   4813 
   4814 	aese	q0, v20.16b
   4815 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   4816 
   4817 	aese	q3, v19.16b
   4818 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   4819 
   4820 	aese	q1, v21.16b
   4821 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   4822 
   4823 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   4824 
   4825 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   4826 
   4827 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   4828 
   4829 	aese	q3, v20.16b
   4830 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   4831 
   4832 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   4833 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   4834 
   4835 	aese	q0, v21.16b
   4836 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   4837 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   4838 
   4839 	aese	q3, v21.16b
   4840 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   4841 
   4842 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   4843 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   4844 
   4845 	aese	q0, v22.16b
   4846 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   4847 	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4848 
   4849 	aese	q3, v22.16b
   4850 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   4851 
   4852 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   4853 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   4854 	add	r12, r12, #1                            @ CTR block 4k+3
   4855 
   4856 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   4857 
   4858 	aese	q3, v23.16b
   4859 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   4860 
   4861 	aese	q2, v22.16b
   4862 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   4863 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   4864 
   4865 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   4866 
   4867 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   4868 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   4869 
   4870 	aese	q2, v23.16b
   4871 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   4872 
   4873 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   4874 	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
   4875 
   4876 	aese	q1, v22.16b
   4877 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   4878 
   4879 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   4880 
   4881 	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
   4882 
   4883 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   4884 
   4885 	aese	q1, v23.16b
   4886 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   4887 
   4888 	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
   4889 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   4890 
   4891 	aese	q0, v23.16b
   4892 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   4893 
   4894 	aese	q1, v24.16b
   4895 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   4896 
   4897 	aese	q2, v24.16b
   4898 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   4899 
   4900 	aese	q0, v24.16b
   4901 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   4902 	movi	q8, #0xc2                                       @ seed for mod_constant (completed by the shl #56 below)
   4903 
   4904 	aese	q3, v24.16b
   4905 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   4906 
   4907 	aese	q1, v25.16b
   4908 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   4909 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   4910 
   4911 	aese	q0, v25.16b
   4912 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   4913 
   4914 	aese	q3, v25.16b
   4915 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   4916 	shl	d8, d8, #56               @ mod_constant
   4917 
   4918 	aese	q1, v26.16b
   4919 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   4920 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
   4921 
   4922 	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   4923 
   4924 	aese	q3, v26.16b
   4925 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   4926 
   4927 	aese	q1, v27.16b
   4928 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   4929 
   4930 	aese	q0, v26.16b
   4931 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   4932 	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
   4933 
   4934 	aese	q3, v27.16b
   4935 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   4936 
   4937 	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
   4938 
   4939 	pmull	v4.1q, q9, q8                                   @ MODULO - top 64b align with mid
   4940 	ext	q9, q9, q9, #8                                  @ MODULO - other top alignment
   4941 
   4942 	aese	q3, v28.16b
   4943 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   4944 
   4945 	aese	q2, v25.16b
   4946 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   4947 	eor	v10.16b, v10.16b, v11.16b                       @ MODULO - karatsuba tidy up (low half of the correction)
   4948 
   4949 	aese	q1, v28.16b
   4950 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   4951 
   4952 	aese	q0, v27.16b
   4953 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   4954 
   4955 	aese	q2, v26.16b
   4956 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   4957 
   4958 	aese	q1, v29.16b
   4959 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   4960 	eor	v10.16b, v10.16b, q4                            @ MODULO - fold into mid
   4961 
   4962 	aese	q0, v28.16b
   4963 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   4964 
   4965 	aese	q2, v27.16b
   4966 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   4967 
   4968 	aese	q1, v30.16b
   4969 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   4970 
   4971 	aese	q0, v29.16b
   4972 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   4973 	eor	v10.16b, v10.16b, q9                            @ MODULO - fold into mid
   4974 
   4975 	aese	q3, v29.16b
   4976 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   4977 
   4978 	aese	q2, v28.16b
   4979 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   4980 
   4981 	aese	q0, v30.16b
   4982 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   4983 
   4984 	pmull	v4.1q, v10.1d, q8                               @ MODULO - mid 64b align with low
   4985 
   4986 	aese	q2, v29.16b
   4987 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   4988 	ext	v10.16b, v10.16b, v10.16b, #8                   @ MODULO - other mid alignment
   4989 
   4990 	aese	q3, v30.16b
   4991 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   4992 
   4993 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   4994 	eor	v11.16b, v11.16b, q4                            @ MODULO - fold into low
   4995 
   4996 	aese	q2, v30.16b
   4997 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   4998 
   4999 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   5000 
   5001 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   5002 
   5003 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   5004 	eor	v11.16b, v11.16b, v10.16b                       @ MODULO - fold into low
   5005 .L256_enc_tail:@ TAIL: encrypt the first remaining block into q5, then branch on the number of bytes left (r5)
   5006 
   5007 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   5008 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
   5009 	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
   5010 #ifdef __ARMEB__
   5011 	rev	r6, r6                          @ big-endian build: byte-swap the loaded plaintext words
   5012 	rev	r7, r7
   5013 #endif
   5014 	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
   5015 	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
   5016 
   5017 	cmp	r5, #48                         @ more than 3 blocks (48 bytes) left?
   5018 	fmov	d4, r6                               @ AES block 4k+4 - mov low
   5019 
   5020 	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
   5021 
   5022 	eor	q5, q4, q0                          @ AES block 4k+4 - result
   5023 	bgt	.L256_enc_blocks_more_than_3    @ yes: keystream blocks q0-q3 are all consumed in order
   5024 
   5025 	cmp	r5, #32                         @ more than 2 blocks left?
   5026 	mov	q3, q2                          @ shift keystream down one slot: the final block always uses q3
   5027 	movi	v11.8b, #0                      @ clear GHASH low accumulator (tag was copied to q8 above)
   5028 
   5029 	movi	q9, #0                          @ clear GHASH high accumulator
   5030 	sub	r12, r12, #1                    @ one prepared keystream block is unused: step the counter back
   5031 
   5032 	mov	q2, q1                          @ the final-1 block uses q2
   5033 	movi	v10.8b, #0                      @ clear GHASH mid accumulator
   5034 	bgt	.L256_enc_blocks_more_than_2    @ 3 blocks left
   5035 
   5036 	mov	q3, q1                          @ at most 2 blocks: the final block takes its keystream from q1
   5037 	sub	r12, r12, #1                    @ second unused keystream block
   5038 	cmp	r5, #16                         @ more than 1 block left?
   5039 
   5040 	bgt	.L256_enc_blocks_more_than_1    @ 2 blocks left
   5041 
   5042 	sub	r12, r12, #1                    @ third unused keystream block
   5043 	b	.L256_enc_blocks_less_than_1    @ single (possibly partial) block: its result is already in q5
   5044 .L256_enc_blocks_more_than_3:@ blocks left >  3: store final-3 ciphertext, start the GHASH accumulators from it, encrypt final-2 with q1
   5045 	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
   5046 
   5047 	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
   5048 #ifdef __ARMEB__
   5049 	rev	r6, r6                          @ big-endian build: byte-swap the loaded input words
   5050 	rev	r7, r7
   5051 #endif
   5052 	rev64	q4, q5                                   @ GHASH final-3 block
   5053 
   5054 	eor	r6, r6, r13                    @ AES final-2 block - round 14 low
   5055 	eor	q4, q4, q8                          @ feed in partial tag
   5056 
   5057 	eor	r7, r7, r14                    @ AES final-2 block - round 14 high
   5058 
   5059 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   5060 	fmov	d5, r6                                @ AES final-2 block - mov low
   5061 
   5062 	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
   5063 
   5064 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   5065 	movi	q8, #0                                       @ suppress further partial tag feed in
   5066 
   5067 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   5068 
   5069 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   5070 
   5071 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   5072 
   5073 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   5074 	eor	q5, q5, q1                           @ AES final-2 block - result
   5075 .L256_enc_blocks_more_than_2:@ blocks left >  2: store final-2 ciphertext, add its GHASH products to the accumulators, encrypt final-1 with q2
   5076 
   5077 	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
   5078 
   5079 	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
   5080 #ifdef __ARMEB__
   5081 	rev	r6, r6                          @ big-endian build: byte-swap the loaded input words
   5082 	rev	r7, r7
   5083 #endif
   5084 	rev64	q4, q5                                   @ GHASH final-2 block
   5085 
   5086 	eor	r6, r6, r13                    @ AES final-1 block - round 14 low
   5087 	eor	q4, q4, q8                          @ feed in partial tag
   5088 
   5089 	fmov	d5, r6                                @ AES final-1 block - mov low
   5090 	eor	r7, r7, r14                    @ AES final-1 block - round 14 high
   5091 
   5092 	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
   5093 
   5094 	movi	q8, #0                                       @ suppress further partial tag feed in
   5095 
   5096 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   5097 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   5098 
   5099 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   5100 
   5101 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   5102 
   5103 	eor	q5, q5, q2                           @ AES final-1 block - result
   5104 
   5105 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   5106 
   5107 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   5108 
   5109 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   5110 
   5111 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   5112 .L256_enc_blocks_more_than_1:@ blocks left >  1: store final-1 ciphertext, add its GHASH products to the accumulators, encrypt the final block with q3
   5113 
   5114 	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result
   5115 
   5116 	rev64	q4, q5                                   @ GHASH final-1 block
   5117 
   5118 	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high
   5119 #ifdef __ARMEB__
   5120 	rev	r6, r6                          @ big-endian build: byte-swap the loaded input words
   5121 	rev	r7, r7
   5122 #endif
   5123 	eor	q4, q4, q8                          @ feed in partial tag
   5124 
   5125 	movi	q8, #0                                       @ suppress further partial tag feed in
   5126 
   5127 	eor	r6, r6, r13                    @ AES final block - round 14 low
   5128 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   5129 
   5130 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   5131 	eor	r7, r7, r14                    @ AES final block - round 14 high
   5132 
   5133 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   5134 
   5135 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   5136 
   5137 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   5138 	fmov	d5, r6                                @ AES final block - mov low
   5139 
   5140 	fmov	v5.d[1], r7                            @ AES final block - mov high
   5141 
   5142 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   5143 
   5144 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   5145 
   5146 	eor	q5, q5, q3                           @ AES final block - result
   5147 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   5148 
   5149 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   5150 .L256_enc_blocks_less_than_1:@ blocks left <= 1: mask the last block, GHASH it, reduce, store counter and tag, restore registers and return
   5151 
   5152 	and	r1, r1, #127                   @ bit_length %= 128
   5153 
   5154 	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff
   5155 	sub	r1, r1, #128                   @ bit_length -= 128
   5156 
   5157 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   5158 	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
   5159 
   5160 	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff
   5161 	and	r1, r1, #127                   @ bit_length %= 128
   5162 
   5163 	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
   5164 	cmp	r1, #64                         @ fewer than 64 bits to mask off?
   5165 
   5166 	csel	r6, r13, r14, lt                @ low-half mask: all ones if lt, otherwise the shifted mask
   5167 	csel	r7, r14, xzr, lt                @ high-half mask: the shifted mask if lt, otherwise zero
   5168 
   5169 	fmov	d0, r6                                @ ctr0b is mask for last block
   5170 
   5171 	fmov	v0.d[1], r7                           @ mask high half
   5172 
   5173 	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
   5174 
   5175 	rev64	q4, q5                                   @ GHASH final block
   5176 
   5177 	eor	q4, q4, q8                          @ feed in partial tag
   5178 
   5179 	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
   5180 
   5181 	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
   5182 	mov	d8, v4.d[1]                                 @ GHASH final block - mid
   5183 #ifndef __ARMEB__
   5184 	rev	r9, r12                         @ little-endian build: byte-reverse the counter before it is stored below
   5185 #else
   5186 	mov	r9, r12                         @ big-endian build: the counter is stored without byte reversal
   5187 #endif
   5188 
   5189 	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
   5190 
   5191 	eor	q9, q9, v20.16b                           @ GHASH final block - high
   5192 	eor	q8, q8, q4                         @ GHASH final block - mid
   5193 
   5194 	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
   5195 
   5196 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
   5197 
   5198 	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
   5199 	movi	q8, #0xc2                                       @ seed for mod_constant (completed by the shl #56 below)
   5200 
   5201 	eor	q4, v11.16b, q9                        @ MODULO - karatsuba tidy up
   5202 
   5203 	shl	d8, d8, #56              @ mod_constant
   5204 
   5205 	eor	v10.16b, v10.16b, q4                        @ MODULO - karatsuba tidy up
   5206 
   5207 	pmull	v7.1q, q9, q8           @ MODULO - top 64b align with mid
   5208 
   5209 	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
   5210 
   5211 	eor	v10.16b, v10.16b, q7                     @ MODULO - fold into mid
   5212 
   5213 	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
   5214 
   5215 	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
   5216 
   5217 	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
   5218 
   5219 	str	r9, [r16, #12]                         @ store the updated counter
   5220 
   5221 	st1	{ q5}, [r2]                         @ store all 16B
   5222 	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
   5223 
   5224 	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
   5225 	ext	v11.16b, v11.16b, v11.16b, #8           @ swap the 64-bit halves of the reduced tag
   5226 	rev64	v11.16b, v11.16b                        @ byte-reverse each half before writing it back
   5227 	mov	r0, r15                                 @ return value = r15 (NOTE(review): presumably byte_len saved at entry, as in aes_gcm_dec_256_kernel - confirm)
   5228 	st1	{ v11.16b }, [r3]                       @ write the updated tag back to [r3]
   5229 
   5230 	ldp	r21, r22, [sp, #16]                     @ reload r21-r24 and d8-d15 from the stack frame
   5231 	ldp	r23, r24, [sp, #32]
   5232 	ldp	d8, d9, [sp, #48]
   5233 	ldp	d10, d11, [sp, #64]
   5234 	ldp	d12, d13, [sp, #80]
   5235 	ldp	d14, d15, [sp, #96]
   5236 	ldp	r19, r20, [sp], #112                    @ reload r19/r20 and release the 112-byte frame
   5237 	RET
   5238 
   5239 .L256_enc_ret:                                  @ NOTE(review): presumably the zero-length early exit - confirm against the branch at function entry
   5240 	mov	r0, #0x0                        @ return 0
   5241 	RET
   5242 .size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
   5243 .globl	aes_gcm_dec_256_kernel
   5244 .type	aes_gcm_dec_256_kernel,%function
   5245 .align	4
   5246 aes_gcm_dec_256_kernel:
   5247 	AARCH64_VALID_CALL_TARGET
   5248 	cbz	r1, .L256_dec_ret
   5249 	stp	r19, r20, [sp, #-112]!
   5250 	mov	r16, r4
   5251 	mov	r8, r5
   5252 	stp	r21, r22, [sp, #16]
   5253 	stp	r23, r24, [sp, #32]
   5254 	stp	d8, d9, [sp, #48]
   5255 	stp	d10, d11, [sp, #64]
   5256 	stp	d12, d13, [sp, #80]
   5257 	stp	d14, d15, [sp, #96]
   5258 
   5259 	lsr	r5, r1, #3              @ byte_len
   5260 	mov	r15, r5
   5261 	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
   5262 #ifdef __ARMEB__
   5263 	rev	r10, r10
   5264 	rev	r11, r11
   5265 #endif
   5266 	ldp	r13, r14, [r8, #224]                     @ load rk14
   5267 #ifdef __ARMEB__
   5268 	ror	r14, r14, #32
   5269 	ror	r13, r13, #32
   5270 #endif
   5271 	ld1	{v18.4s}, [r8], #16                               @ load rk0
   5272 	sub	r5, r5, #1      @ byte_len - 1
   5273 
   5274 	ld1	{v19.4s}, [r8], #16                               @ load rk1
   5275 	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   5276 
   5277 	add	r4, r0, r1, lsr #3   @ end_input_ptr
   5278 	ld1	{v20.4s}, [r8], #16                               @ load rk2
   5279 
   5280 	lsr	r12, r11, #32
   5281 	ld1	{v21.4s}, [r8], #16                               @ load rk3
   5282 	orr	r11, r11, r11
   5283 
   5284 	ld1	{v22.4s}, [r8], #16                               @ load rk4
   5285 	add	r5, r5, r0
   5286 	rev	r12, r12                                @ rev_ctr32
   5287 
   5288 	add	r12, r12, #1                            @ increment rev_ctr32
   5289 	fmov	d3, r10                               @ CTR block 3
   5290 
   5291 	rev	r9, r12                                 @ CTR block 1
   5292 	add	r12, r12, #1                            @ CTR block 1
   5293 	fmov	d1, r10                               @ CTR block 1
   5294 
   5295 	orr	r9, r11, r9, lsl #32            @ CTR block 1
   5296 	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
   5297 
   5298 	fmov	v1.d[1], r9                               @ CTR block 1
   5299 	rev	r9, r12                                 @ CTR block 2
   5300 	add	r12, r12, #1                            @ CTR block 2
   5301 
   5302 	fmov	d2, r10                               @ CTR block 2
   5303 	orr	r9, r11, r9, lsl #32            @ CTR block 2
   5304 
   5305 	fmov	v2.d[1], r9                               @ CTR block 2
   5306 	rev	r9, r12                                 @ CTR block 3
   5307 
   5308 	orr	r9, r11, r9, lsl #32            @ CTR block 3
   5309 	ld1	{v23.4s}, [r8], #16                               @ load rk5
   5310 
   5311 	fmov	v3.d[1], r9                               @ CTR block 3
   5312 	add	r12, r12, #1                            @ CTR block 3
   5313 
   5314 	ld1	{v24.4s}, [r8], #16                               @ load rk6
   5315 
   5316 	ld1	{v25.4s}, [r8], #16                               @ load rk7
   5317 
   5318 	ld1	{v26.4s}, [r8], #16                               @ load rk8
   5319 
   5320 	aese	q0, v18.16b
   5321 	aesmc	q0, q0          @ AES block 0 - round 0
   5322 	ldr	q14, [r3, #80]                         @ load h3l | h3h
   5323 #ifndef __ARMEB__
   5324 	ext	v14.16b, v14.16b, v14.16b, #8
   5325 #endif
   5326 
   5327 	aese	q3, v18.16b
   5328 	aesmc	q3, q3          @ AES block 3 - round 0
   5329 	ldr	q15, [r3, #112]                        @ load h4l | h4h
   5330 #ifndef __ARMEB__
   5331 	ext	v15.16b, v15.16b, v15.16b, #8
   5332 #endif
   5333 
   5334 	aese	q1, v18.16b
   5335 	aesmc	q1, q1          @ AES block 1 - round 0
   5336 	ldr	q13, [r3, #64]                         @ load h2l | h2h
   5337 #ifndef __ARMEB__
   5338 	ext	v13.16b, v13.16b, v13.16b, #8
   5339 #endif
   5340 
   5341 	aese	q2, v18.16b
   5342 	aesmc	q2, q2          @ AES block 2 - round 0
   5343 	ld1	{v27.4s}, [r8], #16                                 @ load rk9
   5344 
   5345 	aese	q0, v19.16b
   5346 	aesmc	q0, q0          @ AES block 0 - round 1
   5347 
   5348 	aese	q1, v19.16b
   5349 	aesmc	q1, q1          @ AES block 1 - round 1
   5350 	ld1	{ v11.16b}, [r3]
   5351 	ext	v11.16b, v11.16b, v11.16b, #8
   5352 	rev64	v11.16b, v11.16b
   5353 
   5354 	aese	q2, v19.16b
   5355 	aesmc	q2, q2          @ AES block 2 - round 1
   5356 	ld1	{v28.4s}, [r8], #16                              @ load rk10
   5357 
   5358 	aese	q3, v19.16b
   5359 	aesmc	q3, q3          @ AES block 3 - round 1
   5360 	ld1	{v29.4s}, [r8], #16                              @ load rk11
   5361 
   5362 	aese	q0, v20.16b
   5363 	aesmc	q0, q0          @ AES block 0 - round 2
   5364 	ldr	q12, [r3, #32]                         @ load h1l | h1h
   5365 #ifndef __ARMEB__
   5366 	ext	v12.16b, v12.16b, v12.16b, #8
   5367 #endif
   5368 	aese	q2, v20.16b
   5369 	aesmc	q2, q2          @ AES block 2 - round 2
   5370 	ld1	{v30.4s}, [r8], #16                              @ load rk12
   5371 
   5372 	aese	q3, v20.16b
   5373 	aesmc	q3, q3          @ AES block 3 - round 2
   5374 
   5375 	aese	q0, v21.16b
   5376 	aesmc	q0, q0          @ AES block 0 - round 3
   5377 
   5378 	aese	q1, v20.16b
   5379 	aesmc	q1, q1          @ AES block 1 - round 2
   5380 
   5381 	aese	q3, v21.16b
   5382 	aesmc	q3, q3          @ AES block 3 - round 3
   5383 
   5384 	aese	q0, v22.16b
   5385 	aesmc	q0, q0          @ AES block 0 - round 4
   5386 	cmp	r0, r5                   @ check if we have <= 4 blocks
   5387 
   5388 	aese	q2, v21.16b
   5389 	aesmc	q2, q2          @ AES block 2 - round 3
   5390 
   5391 	aese	q1, v21.16b
   5392 	aesmc	q1, q1          @ AES block 1 - round 3
   5393 
   5394 	aese	q3, v22.16b
   5395 	aesmc	q3, q3          @ AES block 3 - round 4
   5396 
   5397 	aese	q2, v22.16b
   5398 	aesmc	q2, q2          @ AES block 2 - round 4
   5399 
   5400 	aese	q1, v22.16b
   5401 	aesmc	q1, q1          @ AES block 1 - round 4
   5402 
   5403 	aese	q3, v23.16b
   5404 	aesmc	q3, q3          @ AES block 3 - round 5
   5405 
   5406 	aese	q0, v23.16b
   5407 	aesmc	q0, q0          @ AES block 0 - round 5
   5408 
   5409 	aese	q1, v23.16b
   5410 	aesmc	q1, q1          @ AES block 1 - round 5
   5411 
   5412 	aese	q2, v23.16b
   5413 	aesmc	q2, q2          @ AES block 2 - round 5
   5414 
   5415 	aese	q0, v24.16b
   5416 	aesmc	q0, q0          @ AES block 0 - round 6
   5417 
   5418 	aese	q3, v24.16b
   5419 	aesmc	q3, q3          @ AES block 3 - round 6
   5420 
   5421 	aese	q1, v24.16b
   5422 	aesmc	q1, q1          @ AES block 1 - round 6
   5423 
   5424 	aese	q2, v24.16b
   5425 	aesmc	q2, q2          @ AES block 2 - round 6
   5426 
   5427 	aese	q0, v25.16b
   5428 	aesmc	q0, q0          @ AES block 0 - round 7
   5429 
   5430 	aese	q1, v25.16b
   5431 	aesmc	q1, q1          @ AES block 1 - round 7
   5432 
   5433 	aese	q3, v25.16b
   5434 	aesmc	q3, q3          @ AES block 3 - round 7
   5435 
   5436 	aese	q0, v26.16b
   5437 	aesmc	q0, q0          @ AES block 0 - round 8
   5438 
   5439 	aese	q2, v25.16b
   5440 	aesmc	q2, q2          @ AES block 2 - round 7
   5441 
   5442 	aese	q3, v26.16b
   5443 	aesmc	q3, q3          @ AES block 3 - round 8
   5444 
   5445 	aese	q1, v26.16b
   5446 	aesmc	q1, q1          @ AES block 1 - round 8
   5447 
   5448 	aese	q0, v27.16b
   5449 	aesmc	q0, q0          @ AES block 0 - round 9
   5450 
   5451 	aese	q2, v26.16b
   5452 	aesmc	q2, q2          @ AES block 2 - round 8
   5453 	ld1	{v31.4s}, [r8], #16                             @ load rk13
   5454 
   5455 	aese	q1, v27.16b
   5456 	aesmc	q1, q1          @ AES block 1 - round 9
   5457 
   5458 	aese	q0, v28.16b
   5459 	aesmc	q0, q0          @ AES block 0 - round 10
   5460 
   5461 	aese	q3, v27.16b
   5462 	aesmc	q3, q3          @ AES block 3 - round 9
   5463 
   5464 	aese	q1, v28.16b
   5465 	aesmc	q1, q1          @ AES block 1 - round 10
   5466 
   5467 	aese	q2, v27.16b
   5468 	aesmc	q2, q2          @ AES block 2 - round 9
   5469 
   5470 	aese	q3, v28.16b
   5471 	aesmc	q3, q3          @ AES block 3 - round 10
   5472 
   5473 	aese	q0, v29.16b
   5474 	aesmc	q0, q0          @ AES block 0 - round 11
   5475 
   5476 	aese	q2, v28.16b
   5477 	aesmc	q2, q2          @ AES block 2 - round 10
   5478 
   5479 	aese	q3, v29.16b
   5480 	aesmc	q3, q3          @ AES block 3 - round 11
   5481 
   5482 	aese	q1, v29.16b
   5483 	aesmc	q1, q1          @ AES block 1 - round 11
   5484 
   5485 	aese	q2, v29.16b
   5486 	aesmc	q2, q2          @ AES block 2 - round 11
   5487 
   5488 	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
   5489 
   5490 	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
   5491 
   5492 	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
   5493 	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
   5494 
   5495 	aese	q1, v30.16b
   5496 	aesmc	q1, q1          @ AES block 1 - round 12
   5497 
   5498 	aese	q0, v30.16b
   5499 	aesmc	q0, q0          @ AES block 0 - round 12
   5500 
   5501 	aese	q2, v30.16b
   5502 	aesmc	q2, q2          @ AES block 2 - round 12
   5503 
   5504 	aese	q3, v30.16b
   5505 	aesmc	q3, q3          @ AES block 3 - round 12
   5506 	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
   5507 
   5508 	aese	q1, v31.16b                                     @ AES block 1 - round 13
   5509 
   5510 	aese	q2, v31.16b                                     @ AES block 2 - round 13
   5511 	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
   5512 
   5513 	aese	q3, v31.16b                                     @ AES block 3 - round 13
   5514 
   5515 	aese	q0, v31.16b                                     @ AES block 0 - round 13
   5516 	bge	.L256_dec_tail                                    @ handle tail
   5517 
   5518 	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
   5519 
   5520 	rev	r9, r12                                 @ CTR block 4
   5521 
   5522 	eor	q0, q4, q0                            @ AES block 0 - result
   5523 
   5524 	eor	q1, q5, q1                            @ AES block 1 - result
   5525 	rev64	q5, q5                                    @ GHASH block 1
   5526 	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
   5527 
   5528 	mov	r7, v0.d[1]                            @ AES block 0 - mov high
   5529 
   5530 	mov	r6, v0.d[0]                            @ AES block 0 - mov low
   5531 	rev64	q4, q4                                    @ GHASH block 0
   5532 	add	r12, r12, #1                            @ CTR block 4
   5533 
   5534 	fmov	d0, r10                               @ CTR block 4
   5535 	orr	r9, r11, r9, lsl #32            @ CTR block 4
   5536 
   5537 	fmov	v0.d[1], r9                               @ CTR block 4
   5538 	rev	r9, r12                                 @ CTR block 5
   5539 	add	r12, r12, #1                            @ CTR block 5
   5540 
   5541 	mov	r19, v1.d[0]                            @ AES block 1 - mov low
   5542 
   5543 	orr	r9, r11, r9, lsl #32            @ CTR block 5
   5544 	mov	r20, v1.d[1]                            @ AES block 1 - mov high
   5545 	eor	r7, r7, r14                   @ AES block 0 - round 14 high
   5546 #ifdef __ARMEB__
   5547 	rev	r7, r7
   5548 #endif
   5549 	eor	r6, r6, r13                   @ AES block 0 - round 14 low
   5550 #ifdef __ARMEB__
   5551 	rev	r6, r6
   5552 #endif
   5553 	stp	r6, r7, [r2], #16        @ AES block 0 - store result
   5554 	fmov	d1, r10                               @ CTR block 5
   5555 
   5556 	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
   5557 
   5558 	fmov	v1.d[1], r9                               @ CTR block 5
   5559 	rev	r9, r12                                 @ CTR block 6
   5560 	add	r12, r12, #1                            @ CTR block 6
   5561 
   5562 	eor	r19, r19, r13                   @ AES block 1 - round 14 low
   5563 #ifdef __ARMEB__
   5564 	rev	r19, r19
   5565 #endif
   5566 	orr	r9, r11, r9, lsl #32            @ CTR block 6
   5567 
   5568 	eor	r20, r20, r14                   @ AES block 1 - round 14 high
   5569 #ifdef __ARMEB__
   5570 	rev	r20, r20
   5571 #endif
   5572 	stp	r19, r20, [r2], #16        @ AES block 1 - store result
   5573 
   5574 	eor	q2, q6, q2                            @ AES block 2 - result
   5575 	cmp	r0, r5                   @ check if we have <= 8 blocks
   5576 	bge	.L256_dec_prepretail                              @ do prepretail
   5577 
   5578 .L256_dec_main_loop:@ main loop start
        @ One pass of the main loop, in the order the work is interleaved below:
        @   - finish blocks 4k+2 / 4k+3: q3 ^= q7, move q2/q3 to r21-r24, XOR with
        @     r13/r14 (round 14 low/high) and store 2 x 16 bytes to [r2]
        @   - fold the four ciphertext blocks held in q4-q7 into the GHASH
        @     accumulator v11 (PRE 0/1, low/mid/high products, MODULO steps)
        @   - run AES rounds 0-13 on the counter blocks in q0-q3 (4k+4 .. 4k+7)
        @   - load the next four 16-byte ciphertext blocks from [r0] into q4-q7
        @   - store the results for blocks 4k+4 / 4k+5 (r6/r7, r19/r20) to [r2]
        @ r12 is incremented four times per pass; each new counter block is built
        @ from r10 (low half) and "orr r9, r11, r9, lsl #32" (high half).
        @ The pass repeats while r0 < r5 (cmp near the end, blt on the last line).
        @ NOTE(review): blt is a signed comparison applied to address values -
        @ confirm this is intended for the pointers compared here.
   5579 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   5580 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   5581 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   5582 
   5583 	aese	q0, v18.16b
   5584 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   5585 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   5586 
   5587 	aese	q1, v18.16b
   5588 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   5589 	fmov	d2, r10                               @ CTR block 4k+6
   5590 
   5591 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   5592 	eor	q4, q4, v11.16b                           @ PRE 1
   5593 	rev	r9, r12                                 @ CTR block 4k+7
   5594 
   5595 	aese	q0, v19.16b
   5596 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   5597 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   5598 
   5599 	aese	q1, v19.16b
   5600 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   5601 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   5602 
   5603 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   5604 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   5605 	fmov	d3, r10                               @ CTR block 4k+7
   5606 
   5607 	aese	q0, v20.16b
   5608 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   5609 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   5610 
   5611 	aese	q2, v18.16b
   5612 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   5613 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   5614 
   5615 	aese	q1, v20.16b
   5616 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   5617 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   5618 
   5619 	aese	q0, v21.16b
   5620 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   5621 	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
   5622 #ifdef __ARMEB__
   5623 	rev	r22, r22                        @ __ARMEB__ builds only: byte-reverse r22 before it is stored
   5624 #endif
   5625 	aese	q2, v19.16b
   5626 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   5627 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   5628 
   5629 	aese	q1, v21.16b
   5630 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   5631 	rev64	q6, q6                                    @ GHASH block 4k+2
   5632 
   5633 	aese	q3, v18.16b
   5634 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   5635 	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
   5636 #ifdef __ARMEB__
   5637 	rev	r21, r21                        @ __ARMEB__ builds only: byte-reverse r21 before it is stored
   5638 #endif
   5639 	aese	q2, v20.16b
   5640 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   5641 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   5642 
   5643 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   5644 
   5645 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   5646 
   5647 	aese	q2, v21.16b
   5648 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   5649 	rev64	q7, q7                                    @ GHASH block 4k+3
   5650 
   5651 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   5652 	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
   5653 #ifdef __ARMEB__
   5654 	rev	r23, r23                        @ __ARMEB__ builds only: byte-reverse r23 before it is stored
   5655 #endif
   5656 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   5657 	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
   5658 #ifdef __ARMEB__
   5659 	rev	r24, r24                        @ __ARMEB__ builds only: byte-reverse r24 before it is stored
   5660 #endif
   5661 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   5662 
   5663 	aese	q2, v22.16b
   5664 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   5665 
   5666 	aese	q3, v19.16b
   5667 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   5668 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   5669 
   5670 	aese	q0, v22.16b
   5671 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   5672 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   5673 
   5674 	aese	q2, v23.16b
   5675 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   5676 	add	r12, r12, #1                            @ CTR block 4k+7
   5677 
   5678 	aese	q3, v20.16b
   5679 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   5680 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   5681 
   5682 	aese	q1, v22.16b
   5683 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   5684 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   5685 
   5686 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   5687 
   5688 	aese	q3, v21.16b
   5689 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   5690 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   5691 
   5692 	aese	q1, v23.16b
   5693 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   5694 
   5695 	aese	q0, v23.16b
   5696 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   5697 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   5698 
   5699 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   5700 	rev	r9, r12                                 @ CTR block 4k+8
   5701 
   5702 	aese	q1, v24.16b
   5703 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   5704 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   5705 
   5706 	aese	q0, v24.16b
   5707 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   5708 	add	r12, r12, #1                            @ CTR block 4k+8
   5709 
   5710 	aese	q3, v22.16b
   5711 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   5712 
   5713 	aese	q1, v25.16b
   5714 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   5715 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   5716 
   5717 	aese	q0, v25.16b
   5718 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   5719 
   5720 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   5721 	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
   5722 
   5723 	aese	q3, v23.16b
   5724 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   5725 
   5726 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   5727 
   5728 	aese	q0, v26.16b
   5729 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   5730 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   5731 
   5732 	aese	q3, v24.16b
   5733 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   5734 
   5735 	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   5736 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
   5737 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   5738 
   5739 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   5740 
   5741 	aese	q0, v27.16b
   5742 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   5743 	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
   5744 
   5745 	aese	q1, v26.16b
   5746 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   5747 
   5748 	aese	q2, v24.16b
   5749 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   5750 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   5751 
   5752 	aese	q0, v28.16b
   5753 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   5754 
   5755 	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
   5756 	movi	q8, #0xc2                       @ byte pattern for mod_constant; shifted left by 56 in the shl below
   5757 
   5758 	aese	q2, v25.16b
   5759 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   5760 	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
   5761 
   5762 	aese	q0, v29.16b
   5763 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   5764 
   5765 	aese	q3, v25.16b
   5766 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   5767 	shl	d8, d8, #56               @ mod_constant
   5768 
   5769 	aese	q2, v26.16b
   5770 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   5771 	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
   5772 
   5773 	aese	q0, v30.16b
   5774 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   5775 
   5776 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   5777 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   5778 
   5779 	aese	q1, v27.16b
   5780 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   5781 	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext
   5782 
   5783 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   5784 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   5785 
   5786 	aese	q1, v28.16b
   5787 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   5788 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   5789 
   5790 	aese	q2, v27.16b
   5791 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   5792 	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
   5793 
   5794 	aese	q3, v26.16b
   5795 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   5796 	eor	q0, q4, q0                            @ AES block 4k+4 - result
   5797 
   5798 	aese	q1, v29.16b
   5799 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   5800 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   5801 
   5802 	aese	q2, v28.16b
   5803 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   5804 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   5805 
   5806 	aese	q3, v27.16b
   5807 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   5808 	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext
   5809 
   5810 	aese	q1, v30.16b
   5811 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   5812 	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
   5813 
   5814 	aese	q2, v29.16b
   5815 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   5816 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   5817 
   5818 	aese	q3, v28.16b
   5819 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   5820 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   5821 
   5822 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   5823 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   5824 
   5825 	aese	q2, v30.16b
   5826 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   5827 	fmov	d0, r10                               @ CTR block 4k+8
   5828 
   5829 	aese	q3, v29.16b
   5830 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   5831 	fmov	v0.d[1], r9                               @ CTR block 4k+8
   5832 
   5833 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   5834 	eor	q1, q5, q1                            @ AES block 4k+5 - result
   5835 	rev	r9, r12                                 @ CTR block 4k+9
   5836 
   5837 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   5838 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
   5839 	cmp	r0, r5                   @ LOOP CONTROL - flags are consumed by the blt on the last line of the loop; none of the instructions in between is a flag-setting form
   5840 
   5841 	add	r12, r12, #1                            @ CTR block 4k+9
   5842 
   5843 	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
   5844 #ifdef __ARMEB__
   5845 	rev	r6, r6                          @ __ARMEB__ builds only: byte-reverse r6 before it is stored
   5846 #endif
   5847 	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
   5848 #ifdef __ARMEB__
   5849 	rev	r7, r7                          @ __ARMEB__ builds only: byte-reverse r7 before it is stored
   5850 #endif
   5851 	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
   5852 	eor	q2, q6, q2                            @ AES block 4k+6 - result
   5853 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   5854 
   5855 	aese	q3, v30.16b
   5856 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   5857 	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
   5858 
   5859 	fmov	d1, r10                               @ CTR block 4k+9
   5860 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   5861 
   5862 	fmov	v1.d[1], r9                               @ CTR block 4k+9
   5863 	rev	r9, r12                                 @ CTR block 4k+10
   5864 	add	r12, r12, #1                            @ CTR block 4k+10
   5865 
   5866 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   5867 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
   5868 
   5869 	rev64	q5, q5                                    @ GHASH block 4k+5
   5870 	eor	r20, r20, r14                   @ AES block 4k+5 - round 14 high
   5871 #ifdef __ARMEB__
   5872 	rev	r20, r20                        @ __ARMEB__ builds only: byte-reverse r20 before it is stored
   5873 #endif
   5874 	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
   5875 
   5876 	eor	r19, r19, r13                   @ AES block 4k+5 - round 14 low
   5877 #ifdef __ARMEB__
   5878 	rev	r19, r19                        @ __ARMEB__ builds only: byte-reverse r19 before it is stored
   5879 #endif
   5880 	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
   5881 
   5882 	rev64	q4, q4                                    @ GHASH block 4k+4
   5883 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   5884 	blt	.L256_dec_main_loop                     @ repeat while r0 < r5 (flags from the LOOP CONTROL cmp above)
   5885 
   5886 
   5887 .L256_dec_prepretail:@ PREPRETAIL
        @ Reached by fall-through from the main loop, or from the
        @ "bge .L256_dec_prepretail" that follows the first-block handling.
        @ Does the same kinds of work as one main-loop pass, except that:
        @   - nothing is loaded from [r0] here (there is no ld1 in this section)
        @   - only the results for blocks 4k+2 / 4k+3 (r21/r22, r23/r24) are
        @     stored to [r2]; q0-q3 leave this section after their round-13 aese
        @     without being combined with ciphertext or stored
        @   - r12 is advanced once (CTR block 4k+7)
        @ The ciphertext blocks held in q4-q7 are folded into the GHASH
        @ accumulator v11, including the MODULO steps. Control then falls
        @ through into .L256_dec_tail.
   5888 	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
   5889 	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
   5890 	eor	q3, q7, q3                            @ AES block 4k+3 - result
   5891 
   5892 	aese	q0, v18.16b
   5893 	aesmc	q0, q0          @ AES block 4k+4 - round 0
   5894 	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
   5895 
   5896 	aese	q1, v18.16b
   5897 	aesmc	q1, q1          @ AES block 4k+5 - round 0
   5898 	fmov	d2, r10                               @ CTR block 4k+6
   5899 
   5900 	fmov	v2.d[1], r9                               @ CTR block 4k+6
   5901 	rev	r9, r12                                 @ CTR block 4k+7
   5902 	eor	q4, q4, v11.16b                           @ PRE 1
   5903 
   5904 	rev64	q6, q6                                    @ GHASH block 4k+2
   5905 	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
   5906 	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
   5907 
   5908 	aese	q1, v19.16b
   5909 	aesmc	q1, q1          @ AES block 4k+5 - round 1
   5910 	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
   5911 
   5912 	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
   5913 	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
   5914 	fmov	d3, r10                               @ CTR block 4k+7
   5915 
   5916 	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
   5917 	fmov	v3.d[1], r9                               @ CTR block 4k+7
   5918 
   5919 	aese	q2, v18.16b
   5920 	aesmc	q2, q2          @ AES block 4k+6 - round 0
   5921 	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
   5922 
   5923 	aese	q0, v19.16b
   5924 	aesmc	q0, q0          @ AES block 4k+4 - round 1
   5925 	eor	q8, q8, q4                          @ GHASH block 4k - mid
   5926 
   5927 	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
   5928 
   5929 	aese	q2, v19.16b
   5930 	aesmc	q2, q2          @ AES block 4k+6 - round 1
   5931 	rev64	q7, q7                                    @ GHASH block 4k+3
   5932 
   5933 	aese	q3, v18.16b
   5934 	aesmc	q3, q3          @ AES block 4k+7 - round 0
   5935 
   5936 	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
   5937 	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
   5938 
   5939 	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
   5940 
   5941 	aese	q3, v19.16b
   5942 	aesmc	q3, q3          @ AES block 4k+7 - round 1
   5943 	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
   5944 
   5945 	aese	q0, v20.16b
   5946 	aesmc	q0, q0          @ AES block 4k+4 - round 2
   5947 
   5948 	aese	q1, v20.16b
   5949 	aesmc	q1, q1          @ AES block 4k+5 - round 2
   5950 	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
   5951 
   5952 	aese	q2, v20.16b
   5953 	aesmc	q2, q2          @ AES block 4k+6 - round 2
   5954 
   5955 	aese	q0, v21.16b
   5956 	aesmc	q0, q0          @ AES block 4k+4 - round 3
   5957 	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
   5958 
   5959 	aese	q3, v20.16b
   5960 	aesmc	q3, q3          @ AES block 4k+7 - round 2
   5961 	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
   5962 
   5963 	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
   5964 
   5965 	aese	q0, v22.16b
   5966 	aesmc	q0, q0          @ AES block 4k+4 - round 4
   5967 
   5968 	aese	q3, v21.16b
   5969 	aesmc	q3, q3          @ AES block 4k+7 - round 3
   5970 	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
   5971 
   5972 	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
   5973 
   5974 	aese	q0, v23.16b
   5975 	aesmc	q0, q0          @ AES block 4k+4 - round 5
   5976 	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
   5977 
   5978 	aese	q3, v22.16b
   5979 	aesmc	q3, q3          @ AES block 4k+7 - round 4
   5980 
   5981 	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
   5982 	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
   5983 
   5984 	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
   5985 
   5986 	aese	q3, v23.16b
   5987 	aesmc	q3, q3          @ AES block 4k+7 - round 5
   5988 	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
   5989 
   5990 	aese	q2, v21.16b
   5991 	aesmc	q2, q2          @ AES block 4k+6 - round 3
   5992 
   5993 	aese	q1, v21.16b
   5994 	aesmc	q1, q1          @ AES block 4k+5 - round 3
   5995 	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
   5996 
   5997 	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
   5998 
   5999 	aese	q2, v22.16b
   6000 	aesmc	q2, q2          @ AES block 4k+6 - round 4
   6001 	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
   6002 
   6003 	aese	q1, v22.16b
   6004 	aesmc	q1, q1          @ AES block 4k+5 - round 4
   6005 
   6006 	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
   6007 
   6008 	aese	q2, v23.16b
   6009 	aesmc	q2, q2          @ AES block 4k+6 - round 5
   6010 	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
   6011 
   6012 	aese	q1, v23.16b
   6013 	aesmc	q1, q1          @ AES block 4k+5 - round 5
   6014 
   6015 	aese	q3, v24.16b
   6016 	aesmc	q3, q3          @ AES block 4k+7 - round 6
   6017 	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
   6018 
   6019 	aese	q2, v24.16b
   6020 	aesmc	q2, q2          @ AES block 4k+6 - round 6
   6021 
   6022 	aese	q0, v24.16b
   6023 	aesmc	q0, q0          @ AES block 4k+4 - round 6
   6024 	movi	q8, #0xc2                       @ byte pattern for mod_constant; shifted left by 56 in the shl below
   6025 
   6026 	aese	q1, v24.16b
   6027 	aesmc	q1, q1          @ AES block 4k+5 - round 6
   6028 	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
   6029 
   6030 	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
   6031 
   6032 	aese	q3, v25.16b
   6033 	aesmc	q3, q3          @ AES block 4k+7 - round 7
   6034 	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
   6035 
   6036 	aese	q1, v25.16b
   6037 	aesmc	q1, q1          @ AES block 4k+5 - round 7
   6038 
   6039 	aese	q0, v25.16b
   6040 	aesmc	q0, q0          @ AES block 4k+4 - round 7
   6041 	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
   6042 
   6043 	aese	q3, v26.16b
   6044 	aesmc	q3, q3          @ AES block 4k+7 - round 8
   6045 
   6046 	aese	q2, v25.16b
   6047 	aesmc	q2, q2          @ AES block 4k+6 - round 7
   6048 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   6049 
   6050 	aese	q1, v26.16b
   6051 	aesmc	q1, q1          @ AES block 4k+5 - round 8
   6052 
   6053 	aese	q0, v26.16b
   6054 	aesmc	q0, q0          @ AES block 4k+4 - round 8
   6055 	shl	d8, d8, #56               @ mod_constant
   6056 
   6057 	aese	q2, v26.16b
   6058 	aesmc	q2, q2          @ AES block 4k+6 - round 8
   6059 
   6060 	aese	q1, v27.16b
   6061 	aesmc	q1, q1          @ AES block 4k+5 - round 9
   6062 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   6063 
   6064 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   6065 
   6066 	aese	q2, v27.16b
   6067 	aesmc	q2, q2          @ AES block 4k+6 - round 9
   6068 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   6069 
   6070 	aese	q3, v27.16b
   6071 	aesmc	q3, q3          @ AES block 4k+7 - round 9
   6072 
   6073 	aese	q0, v27.16b
   6074 	aesmc	q0, q0          @ AES block 4k+4 - round 9
   6075 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   6076 
   6077 	aese	q2, v28.16b
   6078 	aesmc	q2, q2          @ AES block 4k+6 - round 10
   6079 
   6080 	aese	q3, v28.16b
   6081 	aesmc	q3, q3          @ AES block 4k+7 - round 10
   6082 
   6083 	aese	q0, v28.16b
   6084 	aesmc	q0, q0          @ AES block 4k+4 - round 10
   6085 	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
   6086 #ifdef __ARMEB__
   6087 	rev	r22, r22                        @ __ARMEB__ builds only: byte-reverse r22 before it is stored
   6088 #endif
   6089 	aese	q1, v28.16b
   6090 	aesmc	q1, q1          @ AES block 4k+5 - round 10
   6091 	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
   6092 #ifdef __ARMEB__
   6093 	rev	r23, r23                        @ __ARMEB__ builds only: byte-reverse r23 before it is stored
   6094 #endif
   6095 	aese	q2, v29.16b
   6096 	aesmc	q2, q2          @ AES block 4k+6 - round 11
   6097 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   6098 
   6099 	aese	q0, v29.16b
   6100 	aesmc	q0, q0          @ AES block 4k+4 - round 11
   6101 	add	r12, r12, #1                            @ CTR block 4k+7
   6102 
   6103 	aese	q1, v29.16b
   6104 	aesmc	q1, q1          @ AES block 4k+5 - round 11
   6105 	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
   6106 #ifdef __ARMEB__
   6107 	rev	r21, r21                        @ __ARMEB__ builds only: byte-reverse r21 before it is stored
   6108 #endif
   6109 
   6110 	aese	q2, v30.16b
   6111 	aesmc	q2, q2          @ AES block 4k+6 - round 12
   6112 
   6113 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   6114 	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
   6115 #ifdef __ARMEB__
   6116 	rev	r24, r24                        @ __ARMEB__ builds only: byte-reverse r24 before it is stored
   6117 #endif
   6118 
   6119 	aese	q3, v29.16b
   6120 	aesmc	q3, q3          @ AES block 4k+7 - round 11
   6121 	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
   6122 
   6123 	aese	q1, v30.16b
   6124 	aesmc	q1, q1          @ AES block 4k+5 - round 12
   6125 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   6126 
   6127 	aese	q0, v30.16b
   6128 	aesmc	q0, q0          @ AES block 4k+4 - round 12
   6129 	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
   6130 
   6131 	aese	q3, v30.16b
   6132 	aesmc	q3, q3          @ AES block 4k+7 - round 12
   6133 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   6134 
   6135 	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
   6136 
   6137 	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
   6138 
   6139 	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
   6140 
   6141 	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
   6142 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   6143 .L256_dec_tail:@ TAIL
   6144 
   6145 	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
   6146 	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
   6147 
   6148 	eor	q0, q5, q0                            @ AES block 4k+4 - result
   6149 
   6150 	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
   6151 
   6152 	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
   6153 	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
   6154 
   6155 	cmp	r5, #48
   6156 
   6157 	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
   6158 #ifdef __ARMEB__
   6159 	rev	r6, r6
   6160 #endif
   6161 
   6162 	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
   6163 #ifdef __ARMEB__
   6164 	rev	r7, r7
   6165 #endif
   6166 	bgt	.L256_dec_blocks_more_than_3
   6167 
   6168 	sub	r12, r12, #1
   6169 	mov	q3, q2
   6170 	movi	v10.8b, #0
   6171 
   6172 	movi	v11.8b, #0
   6173 	cmp	r5, #32
   6174 
   6175 	movi	q9, #0
   6176 	mov	q2, q1
   6177 	bgt	.L256_dec_blocks_more_than_2
   6178 
   6179 	sub	r12, r12, #1
   6180 
   6181 	mov	q3, q1
   6182 	cmp	r5, #16
   6183 	bgt	.L256_dec_blocks_more_than_1
   6184 
   6185 	sub	r12, r12, #1
   6186 	b	.L256_dec_blocks_less_than_1
   6187 .L256_dec_blocks_more_than_3:@ blocks left >  3
   6188 	rev64	q4, q5                                   @ GHASH final-3 block
   6189 	ld1	{ q5}, [r0], #16                     @ AES final-2 block - load ciphertext
   6190 
   6191 	stp	r6, r7, [r2], #16       @ AES final-3 block  - store result
   6192 
   6193 	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
   6194 
   6195 	eor	q4, q4, q8                          @ feed in partial tag
   6196 
   6197 	eor	q0, q5, q1                           @ AES final-2 block - result
   6198 
   6199 	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
   6200 
   6201 	mov	r6, v0.d[0]                           @ AES final-2 block - mov low
   6202 
   6203 	mov	r7, v0.d[1]                           @ AES final-2 block - mov high
   6204 
   6205 	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
   6206 
   6207 	movi	q8, #0                                       @ suppress further partial tag feed in
   6208 
   6209 	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
   6210 
   6211 	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
   6212 	eor	r6, r6, r13                  @ AES final-2 block - round 14 low
   6213 #ifdef __ARMEB__
   6214 	rev	r6, r6
   6215 #endif
   6216 
   6217 	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
   6218 	eor	r7, r7, r14                  @ AES final-2 block - round 14 high
   6219 #ifdef __ARMEB__
   6220 	rev	r7, r7
   6221 #endif
   6222 .L256_dec_blocks_more_than_2:@ blocks left >  2
   6223 
   6224 	rev64	q4, q5                                   @ GHASH final-2 block
   6225 	ld1	{ q5}, [r0], #16                     @ AES final-1 block - load ciphertext
   6226 
   6227 	eor	q4, q4, q8                          @ feed in partial tag
   6228 	stp	r6, r7, [r2], #16       @ AES final-2 block  - store result
   6229 
   6230 	eor	q0, q5, q2                           @ AES final-1 block - result
   6231 
   6232 	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
   6233 
   6234 	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
   6235 
   6236 	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
   6237 
   6238 	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
   6239 	mov	r6, v0.d[0]                           @ AES final-1 block - mov low
   6240 
   6241 	mov	r7, v0.d[1]                           @ AES final-1 block - mov high
   6242 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
   6243 	movi	q8, #0                                       @ suppress further partial tag feed in
   6244 
   6245 	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
   6246 
   6247 	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
   6248 	eor	r6, r6, r13                  @ AES final-1 block - round 14 low
   6249 #ifdef __ARMEB__
   6250 	rev	r6, r6
   6251 #endif
   6252 
   6253 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
   6254 	eor	r7, r7, r14                  @ AES final-1 block - round 14 high
   6255 #ifdef __ARMEB__
   6256 	rev	r7, r7
   6257 #endif
   6258 .L256_dec_blocks_more_than_1:@ blocks left >  1
   6259 
   6260 	stp	r6, r7, [r2], #16       @ AES final-1 block  - store result
   6261 	rev64	q4, q5                                   @ GHASH final-1 block
   6262 
   6263 	ld1	{ q5}, [r0], #16                     @ AES final block - load ciphertext
   6264 
   6265 	eor	q4, q4, q8                          @ feed in partial tag
   6266 	movi	q8, #0                                       @ suppress further partial tag feed in
   6267 
   6268 	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
   6269 
   6270 	eor	q0, q5, q3                           @ AES final block - result
   6271 
   6272 	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
   6273 
   6274 	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
   6275 
   6276 	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
   6277 	mov	r6, v0.d[0]                           @ AES final block - mov low
   6278 
   6279 	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
   6280 
   6281 	mov	r7, v0.d[1]                           @ AES final block - mov high
   6282 
   6283 	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
   6284 	eor	r6, r6, r13                  @ AES final block - round 14 low
   6285 #ifdef __ARMEB__
   6286 	rev	r6, r6
   6287 #endif
   6288 	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
   6289 
   6290 	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
   6291 
   6292 	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
   6293 	eor	r7, r7, r14                  @ AES final block - round 14 high
   6294 #ifdef __ARMEB__
   6295 	rev	r7, r7
   6296 #endif
   6297 .L256_dec_blocks_less_than_1:@ blocks left <= 1
   6298 
   6299 	and	r1, r1, #127                   @ bit_length %= 128
   6300 	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff
   6301 
   6302 	sub	r1, r1, #128                   @ bit_length -= 128
   6303 	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff
   6304 
   6305 	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
   6306 	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
   6307 
   6308 	and	r1, r1, #127                   @ bit_length %= 128
   6309 
   6310 	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
   6311 	cmp	r1, #64
   6312 
   6313 	csel	r9, r13, r14, lt
   6314 	csel	r10, r14, xzr, lt
   6315 
   6316 	fmov	d0, r9                                  @ ctr0b is mask for last block
   6317 	and	r6, r6, r9
   6318 
   6319 	mov	v0.d[1], r10
   6320 	bic	r4, r4, r9          @ mask out low existing bytes
   6321 
   6322 #ifndef __ARMEB__
   6323 	rev	r9, r12
   6324 #else
   6325 	mov	r9, r12
   6326 #endif
   6327 
   6328 	bic	r5, r5, r10      @ mask out high existing bytes
   6329 
   6330 	orr	r6, r6, r4
   6331 
   6332 	and	r7, r7, r10
   6333 
   6334 	orr	r7, r7, r5
   6335 
   6336 	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
   6337 
   6338 	rev64	q4, q5                                    @ GHASH final block
   6339 
   6340 	eor	q4, q4, q8                           @ feed in partial tag
   6341 
   6342 	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
   6343 
   6344 	mov	d8, v4.d[1]                                  @ GHASH final block - mid
   6345 
   6346 	eor	q8, q8, q4                          @ GHASH final block - mid
   6347 
   6348 	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
   6349 
   6350 	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
   6351 
   6352 	eor	q9, q9, v20.16b                            @ GHASH final block - high
   6353 
   6354 	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
   6355 
   6356 	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
   6357 	movi	q8, #0xc2
   6358 
   6359 	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
   6360 
   6361 	shl	d8, d8, #56               @ mod_constant
   6362 
   6363 	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
   6364 
   6365 	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
   6366 
   6367 	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
   6368 
   6369 	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
   6370 
   6371 	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
   6372 
   6373 	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
   6374 
   6375 	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
   6376 
   6377 	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
   6378 
   6379 	stp	r6, r7, [r2]
   6380 
   6381 	str	r9, [r16, #12]                          @ store the updated counter
   6382 
   6383 	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
   6384 	ext	v11.16b, v11.16b, v11.16b, #8
   6385 	rev64	v11.16b, v11.16b
   6386 	mov	r0, r15
   6387 	st1	{ v11.16b }, [r3]
   6388 
   6389 	ldp	r21, r22, [sp, #16]
   6390 	ldp	r23, r24, [sp, #32]
   6391 	ldp	d8, d9, [sp, #48]
   6392 	ldp	d10, d11, [sp, #64]
   6393 	ldp	d12, d13, [sp, #80]
   6394 	ldp	d14, d15, [sp, #96]
   6395 	ldp	r19, r20, [sp], #112
   6396 	RET
   6397 
   6398 .L256_dec_ret:
   6399 	mov	r0, #0x0
   6400 	RET
   6401 .size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
   6402 .section	.rodata	@ switch to the read-only data section for the trailer below
   6403 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	@ unlabelled NUL-terminated ASCII ident string: GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>
   6404 .align	2	@ pad after the string; on ARM targets the operand is a power of two, so this is a 4-byte boundary
   6405 .align	2	@ repeats the directive above; the location counter is already aligned, so no further padding is emitted
   6406 #endif
   6407