Home | History | Annotate | Line # | Download | only in aarch64
      1 #include "arm_arch.h"
      2 
      3 #if __ARM_MAX_ARCH__>=8
      4 .arch	armv8-a+crypto
      5 .text
      6 .globl	aes_gcm_enc_128_kernel
      7 .type	aes_gcm_enc_128_kernel,%function
      8 .align	4
      9 aes_gcm_enc_128_kernel:
     10 	AARCH64_VALID_CALL_TARGET
     11 	cbz	x1, .L128_enc_ret
     12 	stp	x19, x20, [sp, #-112]!
     13 	mov	x16, x4
     14 	mov	x8, x5
     15 	stp	x21, x22, [sp, #16]
     16 	stp	x23, x24, [sp, #32]
     17 	stp	d8, d9, [sp, #48]
     18 	stp	d10, d11, [sp, #64]
     19 	stp	d12, d13, [sp, #80]
     20 	stp	d14, d15, [sp, #96]
     21 
     22 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
     23 #ifdef __AARCH64EB__
     24 	rev	x10, x10
     25 	rev	x11, x11
     26 #endif
     27 	ldp	x13, x14, [x8, #160]                     //load rk10
     28 #ifdef __AARCH64EB__
     29 	ror	x13, x13, #32
     30 	ror	x14, x14, #32
     31 #endif
     32 	ld1	{v11.16b}, [x3]
     33 	ext	v11.16b, v11.16b, v11.16b, #8
     34 	rev64	v11.16b, v11.16b
     35 	lsr	x5, x1, #3              //byte_len
     36 	mov	x15, x5
     37 
     38 	ld1	{v18.4s}, [x8], #16								  //load rk0
     39 	add	x4, x0, x1, lsr #3   //end_input_ptr
     40 	sub	x5, x5, #1      //byte_len - 1
     41 
     42 	lsr	x12, x11, #32
     43 	ldr	q15, [x3, #112]                        //load h4l | h4h
     44 #ifndef __AARCH64EB__
     45 	ext	v15.16b, v15.16b, v15.16b, #8
     46 #endif
     47 	fmov	d1, x10                               //CTR block 1
     48 	rev	w12, w12                                //rev_ctr32
     49 
     50 	add	w12, w12, #1                            //increment rev_ctr32
     51 	orr	w11, w11, w11
     52 	ld1	{v19.4s}, [x8], #16								  //load rk1
     53 
     54 	rev	w9, w12                                 //CTR block 1
     55 	add	w12, w12, #1                            //CTR block 1
     56 	fmov	d3, x10                               //CTR block 3
     57 
     58 	orr	x9, x11, x9, lsl #32            //CTR block 1
     59 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
     60 
     61 	fmov	v1.d[1], x9                               //CTR block 1
     62 	rev	w9, w12                                 //CTR block 2
     63 
     64 	fmov	d2, x10                               //CTR block 2
     65 	orr	x9, x11, x9, lsl #32            //CTR block 2
     66 	add	w12, w12, #1                            //CTR block 2
     67 
     68 	fmov	v2.d[1], x9                               //CTR block 2
     69 	rev	w9, w12                                 //CTR block 3
     70 
     71 	orr	x9, x11, x9, lsl #32            //CTR block 3
     72 	ld1	{v20.4s}, [x8], #16								  //load rk2
     73 
     74 	add	w12, w12, #1                            //CTR block 3
     75 	fmov	v3.d[1], x9                               //CTR block 3
     76 
     77 	ldr	q14, [x3, #80]                         //load h3l | h3h
     78 #ifndef __AARCH64EB__
     79 	ext	v14.16b, v14.16b, v14.16b, #8
     80 #endif
     81 	aese	v1.16b, v18.16b
     82 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
     83 	ld1	{v21.4s}, [x8], #16								  //load rk3
     84 
     85 	aese	v2.16b, v18.16b
     86 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
     87 	ldr	q12, [x3, #32]                         //load h1l | h1h
     88 #ifndef __AARCH64EB__
     89 	ext	v12.16b, v12.16b, v12.16b, #8
     90 #endif
     91 
     92 	aese	v0.16b, v18.16b
     93 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
     94 	ld1	{v22.4s}, [x8], #16								  //load rk4
     95 
     96 	aese	v3.16b, v18.16b
     97 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
     98 	ld1	{v23.4s}, [x8], #16								  //load rk5
     99 
    100 	aese	v2.16b, v19.16b
    101 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
    102 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
    103 
    104 	aese	v0.16b, v19.16b
    105 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
    106 	ld1	{v24.4s}, [x8], #16								  //load rk6
    107 
    108 	aese	v1.16b, v19.16b
    109 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
    110 	ld1	{v25.4s}, [x8], #16								  //load rk7
    111 
    112 	aese	v3.16b, v19.16b
    113 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
    114 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
    115 
    116 	aese	v0.16b, v20.16b
    117 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
    118 	ld1	{v26.4s}, [x8], #16								  //load rk8
    119 
    120 	aese	v1.16b, v20.16b
    121 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
    122 	ldr	q13, [x3, #64]                         //load h2l | h2h
    123 #ifndef __AARCH64EB__
    124 	ext	v13.16b, v13.16b, v13.16b, #8
    125 #endif
    126 
    127 	aese	v3.16b, v20.16b
    128 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
    129 
    130 	aese	v2.16b, v20.16b
    131 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
    132 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
    133 
    134 	aese	v0.16b, v21.16b
    135 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
    136 
    137 	aese	v1.16b, v21.16b
    138 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
    139 
    140 	aese	v2.16b, v21.16b
    141 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
    142 	ld1	{v27.4s}, [x8], #16								  //load rk9
    143 
    144 	aese	v3.16b, v21.16b
    145 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
    146 
    147 	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    148 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
    149 
    150 	aese	v3.16b, v22.16b
    151 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
    152 	add	x5, x5, x0
    153 
    154 	aese	v2.16b, v22.16b
    155 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
    156 	cmp	x0, x5                   //check if we have <= 4 blocks
    157 
    158 	aese	v0.16b, v22.16b
    159 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
    160 
    161 	aese	v3.16b, v23.16b
    162 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
    163 
    164 	aese	v2.16b, v23.16b
    165 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
    166 
    167 	aese	v0.16b, v23.16b
    168 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
    169 
    170 	aese	v3.16b, v24.16b
    171 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
    172 
    173 	aese	v1.16b, v22.16b
    174 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
    175 
    176 	aese	v2.16b, v24.16b
    177 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
    178 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
    179 
    180 	aese	v0.16b, v24.16b
    181 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
    182 
    183 	aese	v1.16b, v23.16b
    184 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
    185 
    186 	aese	v3.16b, v25.16b
    187 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
    188 
    189 	aese	v0.16b, v25.16b
    190 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
    191 
    192 	aese	v1.16b, v24.16b
    193 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
    194 
    195 	aese	v2.16b, v25.16b
    196 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
    197 
    198 	aese	v0.16b, v26.16b
    199 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
    200 
    201 	aese	v1.16b, v25.16b
    202 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
    203 
    204 	aese	v2.16b, v26.16b
    205 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
    206 
    207 	aese	v3.16b, v26.16b
    208 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
    209 
    210 	aese	v1.16b, v26.16b
    211 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
    212 
    213 	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
    214 
    215 	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
    216 
    217 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
    218 
    219 	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
    220 
    221 	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
    222 	b.ge	.L128_enc_tail                                    //handle tail
    223 
    224 	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
    225 #ifdef __AARCH64EB__
    226 	rev	x6, x6
    227 	rev	x7, x7
    228 #endif
    229 	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
    230 #ifdef __AARCH64EB__
    231 	rev	x21, x21
    232 	rev	x22, x22
    233 #endif
    234 	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
    235 #ifdef __AARCH64EB__
    236 	rev	x19, x19
    237 	rev	x20, x20
    238 #endif
    239 	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
    240 #ifdef __AARCH64EB__
    241 	rev	x23, x23
    242 	rev	x24, x24
    243 #endif
    244 	eor	x6, x6, x13                     //AES block 0 - round 10 low
    245 	eor	x7, x7, x14                     //AES block 0 - round 10 high
    246 
    247 	eor	x21, x21, x13                     //AES block 2 - round 10 low
    248 	fmov	d4, x6                               //AES block 0 - mov low
    249 
    250 	eor	x19, x19, x13                     //AES block 1 - round 10 low
    251 	eor	x22, x22, x14                     //AES block 2 - round 10 high
    252 	fmov	v4.d[1], x7                           //AES block 0 - mov high
    253 
    254 	fmov	d5, x19                               //AES block 1 - mov low
    255 	eor	x20, x20, x14                     //AES block 1 - round 10 high
    256 
    257 	eor	x23, x23, x13                     //AES block 3 - round 10 low
    258 	fmov	v5.d[1], x20                           //AES block 1 - mov high
    259 
    260 	fmov	d6, x21                               //AES block 2 - mov low
    261 	eor	x24, x24, x14                     //AES block 3 - round 10 high
    262 	rev	w9, w12                                 //CTR block 4
    263 
    264 	fmov	v6.d[1], x22                           //AES block 2 - mov high
    265 	orr	x9, x11, x9, lsl #32            //CTR block 4
    266 
    267 	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
    268 	fmov	d0, x10                               //CTR block 4
    269 	add	w12, w12, #1                            //CTR block 4
    270 
    271 	fmov	v0.d[1], x9                               //CTR block 4
    272 	rev	w9, w12                                 //CTR block 5
    273 
    274 	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
    275 	fmov	d1, x10                               //CTR block 5
    276 	orr	x9, x11, x9, lsl #32            //CTR block 5
    277 
    278 	add	w12, w12, #1                            //CTR block 5
    279 	add	x0, x0, #64                       //AES input_ptr update
    280 	fmov	v1.d[1], x9                               //CTR block 5
    281 
    282 	fmov	d7, x23                               //AES block 3 - mov low
    283 	rev	w9, w12                                 //CTR block 6
    284 	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
    285 
    286 	fmov	v7.d[1], x24                           //AES block 3 - mov high
    287 	orr	x9, x11, x9, lsl #32            //CTR block 6
    288 
    289 	add	w12, w12, #1                            //CTR block 6
    290 	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
    291 	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
    292 
    293 	fmov	d2, x10                               //CTR block 6
    294 	cmp	x0, x5                   //check if we have <= 8 blocks
    295 
    296 	fmov	v2.d[1], x9                               //CTR block 6
    297 	rev	w9, w12                                 //CTR block 7
    298 	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
    299 
    300 	orr	x9, x11, x9, lsl #32            //CTR block 7
    301 
    302 	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
    303 	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
    304 	b.ge	.L128_enc_prepretail                              //do prepretail
    305 
    306 .L128_enc_main_loop:	//main	loop start
    307 	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
    308 #ifdef __AARCH64EB__
    309 	rev	x23, x23
    310 	rev	x24, x24
    311 #endif
    312 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
    313 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
    314 
    315 	aese	v2.16b, v18.16b
    316 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
    317 	fmov	d3, x10                               //CTR block 4k+3
    318 
    319 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
    320 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
    321 
    322 	aese	v1.16b, v18.16b
    323 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
    324 	add	w12, w12, #1                            //CTR block 4k+3
    325 	fmov	v3.d[1], x9                               //CTR block 4k+3
    326 
    327 	aese	v0.16b, v18.16b
    328 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
    329 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
    330 
    331 	aese	v2.16b, v19.16b
    332 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
    333 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
    334 
    335 	aese	v1.16b, v19.16b
    336 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
    337 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
    338 
    339 	aese	v3.16b, v18.16b
    340 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
    341 	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
    342 
    343 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
    344 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
    345 	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
    346 #ifdef __AARCH64EB__
    347 	rev	x6, x6
    348 	rev	x7, x7
    349 #endif
    350 	aese	v0.16b, v19.16b
    351 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
    352 	rev	w9, w12                                 //CTR block 4k+8
    353 
    354 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
    355 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
    356 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
    357 
    358 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
    359 	add	w12, w12, #1                            //CTR block 4k+8
    360 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
    361 
    362 	aese	v0.16b, v20.16b
    363 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
    364 
    365 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
    366 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
    367 
    368 	aese	v1.16b, v20.16b
    369 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
    370 
    371 	aese	v0.16b, v21.16b
    372 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
    373 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
    374 
    375 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
    376 
    377 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
    378 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    379 
    380 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
    381 
    382 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
    383 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
    384 
    385 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
    386 	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
    387 
    388 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
    389 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
    390 
    391 	aese	v3.16b, v19.16b
    392 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
    393 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
    394 
    395 	aese	v2.16b, v20.16b
    396 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
    397 	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
    398 
    399 	aese	v1.16b, v21.16b
    400 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
    401 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
    402 
    403 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
    404 
    405 	aese	v2.16b, v21.16b
    406 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
    407 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
    408 
    409 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
    410 
    411 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
    412 	movi	v8.8b, #0xc2
    413 
    414 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
    415 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
    416 
    417 	aese	v1.16b, v22.16b
    418 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
    419 
    420 	aese	v3.16b, v20.16b
    421 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
    422 	shl	d8, d8, #56               //mod_constant
    423 
    424 	aese	v0.16b, v22.16b
    425 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
    426 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
    427 
    428 	aese	v1.16b, v23.16b
    429 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
    430 	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
    431 #ifdef __AARCH64EB__
    432 	rev	x19, x19
    433 	rev	x20, x20
    434 #endif
    435 	aese	v3.16b, v21.16b
    436 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
    437 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
    438 
    439 	aese	v0.16b, v23.16b
    440 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
    441 	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
    442 #ifdef __AARCH64EB__
    443 	rev	x21, x21
    444 	rev	x22, x22
    445 #endif
    446 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
    447 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
    448 
    449 	aese	v2.16b, v22.16b
    450 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
    451 	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
    452 
    453 	aese	v3.16b, v22.16b
    454 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
    455 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
    456 
    457 	aese	v1.16b, v24.16b
    458 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
    459 	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
    460 
    461 	aese	v2.16b, v23.16b
    462 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
    463 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
    464 
    465 	fmov	d4, x6                               //AES block 4k+4 - mov low
    466 	aese	v0.16b, v24.16b
    467 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
    468 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
    469 
    470 	add	x0, x0, #64                       //AES input_ptr update
    471 	fmov	d7, x23                               //AES block 4k+3 - mov low
    472 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
    473 
    474 	aese	v3.16b, v23.16b
    475 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
    476 	fmov	d5, x19                               //AES block 4k+5 - mov low
    477 
    478 	aese	v0.16b, v25.16b
    479 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
    480 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
    481 
    482 	aese	v2.16b, v24.16b
    483 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
    484 	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
    485 
    486 	aese	v1.16b, v25.16b
    487 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
    488 	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
    489 
    490 	aese	v0.16b, v26.16b
    491 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
    492 	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
    493 
    494 	aese	v3.16b, v24.16b
    495 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
    496 	cmp	x0, x5                   //.LOOP CONTROL
    497 
    498 	aese	v1.16b, v26.16b
    499 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
    500 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
    501 
    502 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
    503 	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
    504 	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
    505 
    506 	aese	v3.16b, v25.16b
    507 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
    508 	fmov	d6, x21                               //AES block 4k+6 - mov low
    509 
    510 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
    511 	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
    512 
    513 	aese	v2.16b, v25.16b
    514 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
    515 	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
    516 
    517 	fmov	d0, x10                               //CTR block 4k+8
    518 	aese	v3.16b, v26.16b
    519 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
    520 
    521 	fmov	v0.d[1], x9                               //CTR block 4k+8
    522 	rev	w9, w12                                 //CTR block 4k+9
    523 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
    524 
    525 	aese	v2.16b, v26.16b
    526 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
    527 	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
    528 
    529 	add	w12, w12, #1                            //CTR block 4k+9
    530 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
    531 	fmov	d1, x10                               //CTR block 4k+9
    532 
    533 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
    534 	fmov	v1.d[1], x9                               //CTR block 4k+9
    535 	rev	w9, w12                                 //CTR block 4k+10
    536 
    537 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
    538 	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
    539 	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
    540 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
    541 
    542 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
    543 	add	w12, w12, #1                            //CTR block 4k+10
    544 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
    545 	fmov	d2, x10                               //CTR block 4k+10
    546 
    547 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
    548 	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
    549 
    550 	fmov	v2.d[1], x9                               //CTR block 4k+10
    551 	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
    552 	rev	w9, w12                                 //CTR block 4k+11
    553 
    554 	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
    555 	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
    556 
    557 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
    558 	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
    559 	b.lt	.L128_enc_main_loop
    560 
    561 .L128_enc_prepretail:	//PREPRETAIL
    562 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
    563 	fmov	d3, x10                               //CTR block 4k+3
    564 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
    565 
    566 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
    567 	add	w12, w12, #1                            //CTR block 4k+3
    568 	fmov	v3.d[1], x9                               //CTR block 4k+3
    569 
    570 	aese	v1.16b, v18.16b
    571 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
    572 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
    573 
    574 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
    575 
    576 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    577 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
    578 
    579 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
    580 
    581 	aese	v3.16b, v18.16b
    582 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
    583 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
    584 
    585 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
    586 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
    587 
    588 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
    589 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
    590 
    591 	aese	v1.16b, v19.16b
    592 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
    593 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
    594 
    595 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
    596 
    597 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
    598 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
    599 
    600 	aese	v3.16b, v19.16b
    601 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
    602 
    603 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
    604 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
    605 
    606 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
    607 
    608 	aese	v0.16b, v18.16b
    609 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
    610 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
    611 
    612 	aese	v2.16b, v18.16b
    613 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
    614 
    615 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
    616 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
    617 
    618 	aese	v0.16b, v19.16b
    619 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
    620 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
    621 
    622 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
    623 
    624 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
    625 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
    626 
    627 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
    628 
    629 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
    630 
    631 	aese	v2.16b, v19.16b
    632 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
    633 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
    634 
    635 	aese	v0.16b, v20.16b
    636 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
    637 
    638 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
    639 	movi	v8.8b, #0xc2
    640 
    641 	aese	v2.16b, v20.16b
    642 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
    643 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
    644 
    645 	aese	v3.16b, v20.16b
    646 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
    647 
    648 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
    649 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
    650 
    651 	aese	v2.16b, v21.16b
    652 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
    653 
    654 	aese	v1.16b, v20.16b
    655 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
    656 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
    657 
    658 	aese	v0.16b, v21.16b
    659 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
    660 
    661 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
    662 	shl	d8, d8, #56               //mod_constant
    663 
    664 	aese	v1.16b, v21.16b
    665 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
    666 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
    667 
    668 	aese	v0.16b, v22.16b
    669 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
    670 
    671 	pmull	v28.1q, v9.1d, v8.1d
    672 	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
    673 
    674 	aese	v1.16b, v22.16b
    675 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
    676 
    677 	aese	v0.16b, v23.16b
    678 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
    679 	ext	v9.16b, v9.16b, v9.16b, #8
    680 
    681 	aese	v3.16b, v21.16b
    682 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
    683 
    684 	aese	v2.16b, v22.16b
    685 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
    686 	eor	v10.16b, v10.16b, v11.16b
    687 
    688 	aese	v0.16b, v24.16b
    689 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
    690 
    691 	aese	v3.16b, v22.16b
    692 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
    693 
    694 	aese	v1.16b, v23.16b
    695 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
    696 
    697 	aese	v2.16b, v23.16b
    698 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
    699 	eor	v10.16b, v10.16b, v28.16b
    700 
    701 	aese	v3.16b, v23.16b
    702 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
    703 
    704 	aese	v1.16b, v24.16b
    705 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
    706 
    707 	aese	v2.16b, v24.16b
    708 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
    709 
    710 	aese	v3.16b, v24.16b
    711 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
    712 	eor	v10.16b, v10.16b, v9.16b
    713 
    714 	aese	v0.16b, v25.16b
    715 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
    716 
    717 	aese	v2.16b, v25.16b
    718 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
    719 
    720 	aese	v3.16b, v25.16b
    721 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
    722 
    723 	pmull	v28.1q, v10.1d, v8.1d
    724 
    725 	aese	v1.16b, v25.16b
    726 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
    727 	ext	v10.16b, v10.16b, v10.16b, #8
    728 
    729 	aese	v3.16b, v26.16b
    730 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
    731 
    732 	aese	v0.16b, v26.16b
    733 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
    734 	eor	v11.16b, v11.16b, v28.16b
    735 
    736 	aese	v1.16b, v26.16b
    737 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
    738 
    739 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
    740 
    741 	aese	v2.16b, v26.16b
    742 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
    743 
    744 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
    745 
    746 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
    747 	eor	v11.16b, v11.16b, v10.16b
    748 
    749 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
    750 .L128_enc_tail:	//TAIL - encrypt and GHASH the input still unread between x0 and end_input_ptr (x4); at most four 16-byte loads are issued from here on
    751 
    752 	sub	x5, x4, x0   //x5 (the main_end_input_ptr register) now holds the number of bytes left to process: end_input_ptr - input_ptr
    753 	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext (a full 16 bytes are read whatever x5 is)
    754 #ifdef __AARCH64EB__
    755 	rev	x6, x6
    756 	rev	x7, x7
    757 #endif
    758 	cmp	x5, #48                     //more than 3 blocks left? flags are consumed by the b.gt below, nothing in between sets them
    759 
    760 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag (v8 = current GHASH value with its halves swapped)
    761 	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low (x13 = low half of rk10)
    762 	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high (x14 = high half of rk10)
    763 
    764 	fmov	d4, x6                               //AES block 4k+4 - mov low
    765 
    766 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
    767 
    768 	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result (ciphertext of the first tail block, kept in v5 until stored)
    769 
    770 	b.gt	.L128_enc_blocks_more_than_3
    771 
    772 	sub	w12, w12, #1                     //at most 3 blocks left: rev_ctr32 -= 1 (one decrement per keystream register the tail leaves unused)
    773 	movi	v11.8b, #0                       //zero the GHASH low accumulator; its previous value was captured in v8 above
    774 	mov	v3.16b, v2.16b                   //shift keystream down: the final block is always xored with v3
    775 
    776 	cmp	x5, #32                          //more than 2 blocks left? consumed by the b.gt below
    777 	mov	v2.16b, v1.16b                   //shift keystream down: the final-1 block is always xored with v2
    778 	movi	v9.8b, #0                        //zero the GHASH high accumulator
    779 
    780 	movi	v10.8b, #0                       //zero the GHASH mid accumulator
    781 	b.gt	.L128_enc_blocks_more_than_2
    782 
    783 	mov	v3.16b, v1.16b                   //at most 2 blocks left: the final block uses the keystream originally in v1
    784 	cmp	x5, #16                          //more than 1 block left? consumed by the b.gt below
    785 
    786 	sub	w12, w12, #1                     //rev_ctr32 -= 1 for the second unused keystream register
    787 	b.gt	.L128_enc_blocks_more_than_1
    788 
    789 	sub	w12, w12, #1                     //single block left: rev_ctr32 -= 1 for the third unused keystream register
    790 	b	.L128_enc_blocks_less_than_1
    791 .L128_enc_blocks_more_than_3:	//blocks left > 3: store and GHASH the ciphertext in v5, then encrypt the next plaintext block with the keystream in v1
    792 	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result
    793 
    794 	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
    795 #ifdef __AARCH64EB__
    796 	rev	x6, x6
    797 	rev	x7, x7
    798 #endif
    799 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block (byte-reverse each 64-bit half of the ciphertext)
    800 
    801 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
    802 	eor	x7, x7, x14                     //AES final-2 block - round 10 high
    803 	eor	x6, x6, x13                     //AES final-2 block - round 10 low
    804 
    805 	fmov	d5, x6                                 //AES final-2 block - mov low
    806 
    807 	movi	v8.8b, #0                                        //suppress further partial tag feed in
    808 	fmov	v5.d[1], x7                             //AES final-2 block - mov high
    809 
    810 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low (v15 = h4; v11 is assigned, not accumulated: first term of the tail sum)
    811 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid (v22 was a round key during the AES rounds and is scratch from here on)
    812 
    813 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high (v9 is assigned, not accumulated)
    814 
    815 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid (upper half of v17; presumably h4k, set up outside this excerpt)
    816 
    817 	eor	v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
    818 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid (high half ^ low half of the block)
    819 
    820 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid (v10 is assigned, not accumulated)
    821 .L128_enc_blocks_more_than_2:	//blocks left > 2: store and GHASH the ciphertext in v5, then encrypt the next plaintext block with the keystream in v2
    822 
    823 	st1	{ v5.16b}, [x2], #16                     //AES final-2 block - store result
    824 
    825 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
    826 	ldp	x6, x7, [x0], #16           //AES final-1 block - load input low & high
    827 #ifdef __AARCH64EB__
    828 	rev	x6, x6
    829 	rev	x7, x7
    830 #endif
    831 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero if an earlier tail block already consumed it)
    832 
    833 	eor	x6, x6, x13                     //AES final-1 block - round 10 low
    834 
    835 	fmov	d5, x6                                 //AES final-1 block - mov low
    836 	eor	x7, x7, x14                     //AES final-1 block - round 10 high
    837 
    838 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high (v14 presumably h3, loaded outside this excerpt; v20 is scratch here)
    839 	fmov	v5.d[1], x7                             //AES final-1 block - mov high
    840 
    841 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
    842 
    843 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low (v21 is scratch here)
    844 
    845 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high (accumulate)
    846 
    847 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid (high half ^ low half of the block)
    848 
    849 	eor	v5.16b, v5.16b, v2.16b                            //AES final-1 block - result
    850 
    851 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low (accumulate)
    852 
    853 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid (multiplies by the lower half of v17)
    854 
    855 	movi	v8.8b, #0                                        //suppress further partial tag feed in
    856 
    857 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid (accumulate)
    858 .L128_enc_blocks_more_than_1:	//blocks left > 1: store and GHASH the ciphertext in v5, then encrypt the final plaintext block with the keystream in v3
    859 
    860 	st1	{ v5.16b}, [x2], #16                     //AES final-1 block - store result
    861 
    862 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
    863 	ldp	x6, x7, [x0], #16           //AES final block - load input low & high (a full 16 bytes are read even if the block is partial)
    864 #ifdef __AARCH64EB__
    865 	rev	x6, x6
    866 	rev	x7, x7
    867 #endif
    868 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero if an earlier tail block already consumed it)
    869 
    870 	eor	x7, x7, x14                     //AES final block - round 10 high
    871 	eor	x6, x6, x13                     //AES final block - round 10 low
    872 
    873 	fmov	d5, x6                                 //AES final block - mov low
    874 
    875 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high (v13 presumably h2, loaded outside this excerpt)
    876 	fmov	v5.d[1], x7                             //AES final block - mov high
    877 
    878 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
    879 
    880 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
    881 
    882 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid (high half ^ low half of the block)
    883 
    884 	eor	v5.16b, v5.16b, v3.16b                            //AES final block - result
    885 
    886 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid (copy into the upper lane for the pmull2 below)
    887 
    888 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid (multiplies by the upper half of v16)
    889 
    890 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low (accumulate)
    891 
    892 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high (accumulate)
    893 
    894 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid (accumulate)
    895 	movi	v8.8b, #0                                        //suppress further partial tag feed in
    896 .L128_enc_blocks_less_than_1:	//blocks left <= 1: mask, GHASH and store the final (possibly partial) block in v5, reduce the tag, write back counter and tag, return
    897 
    898 	and	x1, x1, #127                    //bit_length %= 128
    899 	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff (x13 is no longer needed as a round key)
    900 
    901 	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff (x14 is no longer needed as a round key)
    902 	sub	x1, x1, #128                    //bit_length -= 128
    903 
    904 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
    905 
    906 	and	x1, x1, #127                    //bit_length %= 128, i.e. x1 = number of unused bits in the last block, in range [0,127]
    907 
    908 	lsr	x14, x14, x1                     //x14 = all-ones >> (x1 mod 64); a register-specified shift uses the amount modulo 64
    909 	cmp	x1, #64                          //does the unused part fit inside the upper doubleword alone?
    910 
    911 	csel	x6, x13, x14, lt                 //low doubleword mask: all ones if fewer than 64 bits are unused, else the shifted mask
    912 	csel	x7, x14, xzr, lt                 //high doubleword mask: the shifted mask if fewer than 64 bits are unused, else zero
    913 
    914 	fmov	d0, x6                                 //ctr0b is mask for last block (v0 is no longer needed as keystream)
    915 
    916 	fmov	v0.d[1], x7
    917 
    918 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
    919 
    920 	rev64	v4.16b, v5.16b                                    //GHASH final block
    921 
    922 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero if an earlier tail block already consumed it)
    923 
    924 	mov	d8, v4.d[1]                                  //GHASH final block - mid (v8 becomes scratch now that the tag has been fed in)
    925 
    926 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low (v12 presumably h1, loaded outside this excerpt)
    927 	ld1	{ v18.16b}, [x2]                            //load existing bytes where the possibly partial last block is to be stored
    928 
    929 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid (high half ^ low half of the block)
    930 #ifndef __AARCH64EB__
    931 	rev	w9, w12                                //little-endian build: byte-reverse rev_ctr32 for the counter store below
    932 #else
    933 	mov	w9, w12                                //big-endian build: rev_ctr32 is stored as is
    934 #endif
    935 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
    936 
    937 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid (multiplies by the lower half of v16)
    938 
    939 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
    940 
    941 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
    942 
    943 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
    944 	movi	v8.8b, #0xc2                                    //every byte of d8 = 0xc2; the shl below leaves d8 = 0xc200000000000000
    945 
    946 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
    947 
    948 	shl	d8, d8, #56               //mod_constant
    949 
    950 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
    951 
    952 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
    953 
    954 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
    955 
    956 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
    957 
    958 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
    959 
    960 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
    961 
    962 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
    963 
    964 	bif	v5.16b, v18.16b, v0.16b                              //insert existing bytes in top end of result before storing (bits where mask v0 is 0 are taken from v18)
    965 
    966 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
    967 	st1	{ v5.16b}, [x2]                          //store all 16B (bytes past the message end are rewritten with the values loaded into v18)
    968 
    969 	str	w9, [x16, #12]                          //store the updated counter (x16 = counter block pointer copied from x4 at entry)
    970 
    971 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
    972 	ext	v11.16b, v11.16b, v11.16b, #8                     //swap the two 64-bit halves of the tag ...
    973 	rev64	v11.16b, v11.16b                                  //... and byte-reverse each half: the same ext/rev64 pair applied when the tag was loaded from [x3]
    974 	mov	x0, x15                                  //return value: byte_len saved in x15 at function entry
    975 	st1	{ v11.16b }, [x3]                        //write the updated tag back to [x3]
    976 	ldp	x21, x22, [sp, #16]                      //epilogue: restore the registers saved by the prologue
    977 	ldp	x23, x24, [sp, #32]
    978 	ldp	d8, d9, [sp, #48]
    979 	ldp	d10, d11, [sp, #64]
    980 	ldp	d12, d13, [sp, #80]
    981 	ldp	d14, d15, [sp, #96]
    982 	ldp	x19, x20, [sp], #112                     //restore x19/x20 and release the 112-byte frame
    983 	ret
    984 
    985 .L128_enc_ret:
    986 	mov	w0, #0x0
    987 	ret
    988 .size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
    989 .globl	aes_gcm_dec_128_kernel
    990 .type	aes_gcm_dec_128_kernel,%function
    991 .align	4
    992 aes_gcm_dec_128_kernel:
    993 	AARCH64_VALID_CALL_TARGET
    994 	cbz	x1, .L128_dec_ret
    995 	stp	x19, x20, [sp, #-112]!
    996 	mov	x16, x4
    997 	mov	x8, x5
    998 	stp	x21, x22, [sp, #16]
    999 	stp	x23, x24, [sp, #32]
   1000 	stp	d8, d9, [sp, #48]
   1001 	stp	d10, d11, [sp, #64]
   1002 	stp	d12, d13, [sp, #80]
   1003 	stp	d14, d15, [sp, #96]
   1004 
   1005 	lsr	x5, x1, #3              //byte_len
   1006 	mov	x15, x5
   1007 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   1008 #ifdef __AARCH64EB__
   1009 	rev	x10, x10
   1010 	rev	x11, x11
   1011 #endif
   1012 	ldp	x13, x14, [x8, #160]                     //load rk10
   1013 #ifdef __AARCH64EB__
   1014 	ror	x14, x14, 32
   1015 	ror	x13, x13, 32
   1016 #endif
   1017 	sub	x5, x5, #1      //byte_len - 1
   1018 	ld1	{v18.4s}, [x8], #16                                //load rk0
   1019 
   1020 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   1021 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   1022 
   1023 	ldr	q13, [x3, #64]                         //load h2l | h2h
   1024 #ifndef __AARCH64EB__
   1025 	ext	v13.16b, v13.16b, v13.16b, #8
   1026 #endif
   1027 	lsr	x12, x11, #32
   1028 	fmov	d2, x10                               //CTR block 2
   1029 
   1030 	ld1	{v19.4s}, [x8], #16                                //load rk1
   1031 	orr	w11, w11, w11
   1032 	rev	w12, w12                                //rev_ctr32
   1033 
   1034 	fmov	d1, x10                               //CTR block 1
   1035 	add	w12, w12, #1                            //increment rev_ctr32
   1036 
   1037 	aese	v0.16b, v18.16b
   1038 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   1039 	rev	w9, w12                                 //CTR block 1
   1040 
   1041 	orr	x9, x11, x9, lsl #32            //CTR block 1
   1042 	ld1	{v20.4s}, [x8], #16                                //load rk2
   1043 	add	w12, w12, #1                            //CTR block 1
   1044 
   1045 	fmov	v1.d[1], x9                               //CTR block 1
   1046 	rev	w9, w12                                 //CTR block 2
   1047 	add	w12, w12, #1                            //CTR block 2
   1048 
   1049 	aese	v0.16b, v19.16b
   1050 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   1051 	orr	x9, x11, x9, lsl #32            //CTR block 2
   1052 
   1053 	fmov	v2.d[1], x9                               //CTR block 2
   1054 	rev	w9, w12                                 //CTR block 3
   1055 
   1056 	fmov	d3, x10                               //CTR block 3
   1057 	orr	x9, x11, x9, lsl #32            //CTR block 3
   1058 	add	w12, w12, #1                            //CTR block 3
   1059 
   1060 	fmov	v3.d[1], x9                               //CTR block 3
   1061 	add	x4, x0, x1, lsr #3   //end_input_ptr
   1062 
   1063 	aese	v1.16b, v18.16b
   1064 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   1065 	ld1	{v21.4s}, [x8], #16                                //load rk3
   1066 
   1067 	aese	v0.16b, v20.16b
   1068 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   1069 	ld1	{v22.4s}, [x8], #16                                //load rk4
   1070 
   1071 	aese	v2.16b, v18.16b
   1072 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   1073 	ld1	{v23.4s}, [x8], #16                                //load rk5
   1074 
   1075 	aese	v1.16b, v19.16b
   1076 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   1077 	ld1	{v24.4s}, [x8], #16                                //load rk6
   1078 
   1079 	aese	v3.16b, v18.16b
   1080 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   1081 
   1082 	aese	v2.16b, v19.16b
   1083 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   1084 
   1085 	aese	v1.16b, v20.16b
   1086 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   1087 
   1088 	aese	v3.16b, v19.16b
   1089 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   1090 	ld1	{ v11.16b}, [x3]
   1091 	ext	v11.16b, v11.16b, v11.16b, #8
   1092 	rev64	v11.16b, v11.16b
   1093 
   1094 	aese	v0.16b, v21.16b
   1095 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   1096 	ld1	{v25.4s}, [x8], #16                                //load rk7
   1097 
   1098 	aese	v1.16b, v21.16b
   1099 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   1100 
   1101 	aese	v3.16b, v20.16b
   1102 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   1103 
   1104 	aese	v2.16b, v20.16b
   1105 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   1106 	ld1	{v26.4s}, [x8], #16                                //load rk8
   1107 
   1108 	aese	v1.16b, v22.16b
   1109 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   1110 
   1111 	aese	v3.16b, v21.16b
   1112 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   1113 
   1114 	aese	v2.16b, v21.16b
   1115 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   1116 	ldr	q14, [x3, #80]                         //load h3l | h3h
   1117 #ifndef __AARCH64EB__
   1118 	ext	v14.16b, v14.16b, v14.16b, #8
   1119 #endif
   1120 	aese	v0.16b, v22.16b
   1121 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   1122 	ld1	{v27.4s}, [x8], #16                                //load rk9
   1123 
   1124 	aese	v1.16b, v23.16b
   1125 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   1126 
   1127 	aese	v2.16b, v22.16b
   1128 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   1129 
   1130 	aese	v3.16b, v22.16b
   1131 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   1132 
   1133 	aese	v0.16b, v23.16b
   1134 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   1135 
   1136 	aese	v2.16b, v23.16b
   1137 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   1138 	ldr	q12, [x3, #32]                         //load h1l | h1h
   1139 #ifndef __AARCH64EB__
   1140 	ext	v12.16b, v12.16b, v12.16b, #8
   1141 #endif
   1142 	aese	v3.16b, v23.16b
   1143 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   1144 
   1145 	aese	v0.16b, v24.16b
   1146 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   1147 
   1148 	aese	v1.16b, v24.16b
   1149 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   1150 
   1151 	aese	v3.16b, v24.16b
   1152 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   1153 
   1154 	aese	v2.16b, v24.16b
   1155 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   1156 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   1157 
   1158 	ldr	q15, [x3, #112]                        //load h4l | h4h
   1159 #ifndef __AARCH64EB__
   1160 	ext	v15.16b, v15.16b, v15.16b, #8
   1161 #endif
   1162 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   1163 	add	x5, x5, x0
   1164 
   1165 	aese	v1.16b, v25.16b
   1166 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   1167 
   1168 	aese	v2.16b, v25.16b
   1169 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   1170 
   1171 	aese	v0.16b, v25.16b
   1172 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   1173 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   1174 
   1175 	aese	v3.16b, v25.16b
   1176 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   1177 
   1178 	aese	v1.16b, v26.16b
   1179 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   1180 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   1181 
   1182 	aese	v2.16b, v26.16b
   1183 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   1184 
   1185 	aese	v3.16b, v26.16b
   1186 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   1187 
   1188 	aese	v0.16b, v26.16b
   1189 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   1190 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   1191 
   1192 	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
   1193 
   1194 	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
   1195 
   1196 	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
   1197 	cmp	x0, x5                   //check if we have <= 4 blocks
   1198 
   1199 	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
   1200 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   1201 	b.ge	.L128_dec_tail                                    //handle tail
   1202 
   1203 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0 - load ciphertext; AES block 1 - load ciphertext
   1204 
   1205 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   1206 	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
   1207 
   1208 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   1209 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   1210 	rev	w9, w12                                 //CTR block 4
   1211 
   1212 	orr	x9, x11, x9, lsl #32            //CTR block 4
   1213 	add	w12, w12, #1                            //CTR block 4
   1214 	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
   1215 
   1216 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   1217 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   1218 
   1219 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   1220 
   1221 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   1222 	cmp	x0, x5                   //check if we have <= 8 blocks
   1223 
   1224 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   1225 
   1226 	fmov	d0, x10                               //CTR block 4
   1227 
   1228 	fmov	v0.d[1], x9                               //CTR block 4
   1229 	rev	w9, w12                                 //CTR block 5
   1230 	eor	x19, x19, x13                   //AES block 1 - round 10 low
   1231 #ifdef __AARCH64EB__
   1232 	rev	x19, x19
   1233 #endif
   1234 	fmov	d1, x10                               //CTR block 5
   1235 	add	w12, w12, #1                            //CTR block 5
   1236 	orr	x9, x11, x9, lsl #32            //CTR block 5
   1237 
   1238 	fmov	v1.d[1], x9                               //CTR block 5
   1239 	rev	w9, w12                                 //CTR block 6
   1240 	add	w12, w12, #1                            //CTR block 6
   1241 
   1242 	orr	x9, x11, x9, lsl #32            //CTR block 6
   1243 
   1244 	eor	x20, x20, x14                   //AES block 1 - round 10 high
   1245 #ifdef __AARCH64EB__
   1246 	rev	x20, x20
   1247 #endif
   1248 	eor	x6, x6, x13                   //AES block 0 - round 10 low
   1249 #ifdef __AARCH64EB__
   1250 	rev	x6, x6
   1251 #endif
   1252 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   1253 
   1254 	eor	x7, x7, x14                   //AES block 0 - round 10 high
   1255 #ifdef __AARCH64EB__
   1256 	rev	x7, x7
   1257 #endif
   1258 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   1259 
   1260 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   1261 	b.ge	.L128_dec_prepretail                              //do prepretail
   1262 
   1263 .L128_dec_main_loop:	//main	loop start
   1264 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   1265 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   1266 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   1267 
   1268 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   1269 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   1270 
   1271 	aese	v1.16b, v18.16b
   1272 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   1273 	fmov	d2, x10                               //CTR block 4k+6
   1274 
   1275 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   1276 	fmov	v2.d[1], x9                               //CTR block 4k+6
   1277 	rev	w9, w12                                 //CTR block 4k+7
   1278 
   1279 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   1280 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   1281 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
   1282 
   1283 	aese	v1.16b, v19.16b
   1284 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   1285 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   1286 
   1287 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   1288 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   1289 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   1290 
   1291 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   1292 	fmov	d3, x10                               //CTR block 4k+7
   1293 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
   1294 
   1295 	aese	v1.16b, v20.16b
   1296 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   1297 	fmov	v3.d[1], x9                               //CTR block 4k+7
   1298 
   1299 	aese	v2.16b, v18.16b
   1300 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   1301 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   1302 
   1303 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   1304 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
   1305 
   1306 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   1307 
   1308 	aese	v1.16b, v21.16b
   1309 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   1310 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   1311 
   1312 	aese	v3.16b, v18.16b
   1313 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   1314 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
   1315 
   1316 	aese	v0.16b, v18.16b
   1317 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   1318 
   1319 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   1320 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   1321 
   1322 	aese	v3.16b, v19.16b
   1323 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   1324 	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
   1325 #ifdef __AARCH64EB__
   1326 	rev	x23, x23
   1327 #endif
   1328 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
   1329 	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
   1330 #ifdef __AARCH64EB__
   1331 	rev	x22, x22
   1332 #endif
   1333 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   1334 
   1335 	aese	v0.16b, v19.16b
   1336 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   1337 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
   1338 
   1339 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   1340 
   1341 	aese	v3.16b, v20.16b
   1342 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   1343 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   1344 
   1345 	aese	v0.16b, v20.16b
   1346 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   1347 
   1348 	aese	v1.16b, v22.16b
   1349 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   1350 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
   1351 
   1352 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   1353 
   1354 	aese	v0.16b, v21.16b
   1355 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   1356 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   1357 
   1358 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   1359 
   1360 	aese	v2.16b, v19.16b
   1361 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   1362 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   1363 
   1364 	aese	v0.16b, v22.16b
   1365 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   1366 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
   1367 
   1368 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   1369 	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
   1370 #ifdef __AARCH64EB__
   1371 	rev	x24, x24
   1372 #endif
   1373 	aese	v2.16b, v20.16b
   1374 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   1375 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   1376 
   1377 	aese	v1.16b, v23.16b
   1378 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   1379 	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
   1380 #ifdef __AARCH64EB__
   1381 	rev	x21, x21
   1382 #endif
   1383 	aese	v0.16b, v23.16b
   1384 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   1385 	movi	v8.8b, #0xc2
   1386 
   1387 	aese	v2.16b, v21.16b
   1388 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   1389 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
   1390 
   1391 	aese	v1.16b, v24.16b
   1392 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   1393 
   1394 	aese	v0.16b, v24.16b
   1395 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   1396 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   1397 
   1398 	aese	v2.16b, v22.16b
   1399 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   1400 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   1401 
   1402 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   1403 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
   1404 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+3 - load ciphertext
   1405 
   1406 	aese	v1.16b, v25.16b
   1407 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   1408 	add	w12, w12, #1                            //CTR block 4k+7
   1409 
   1410 	aese	v0.16b, v25.16b
   1411 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   1412 	shl	d8, d8, #56               //mod_constant
   1413 
   1414 	aese	v2.16b, v23.16b
   1415 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   1416 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   1417 
   1418 	aese	v1.16b, v26.16b
   1419 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   1420 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   1421 
   1422 	aese	v0.16b, v26.16b
   1423 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   1424 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1425 
   1426 	aese	v3.16b, v21.16b
   1427 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   1428 	rev	w9, w12                                 //CTR block 4k+8
   1429 
   1430 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1431 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
   1432 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1433 
   1434 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
   1435 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   1436 
   1437 	aese	v3.16b, v22.16b
   1438 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   1439 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1440 
   1441 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
   1442 
   1443 	aese	v2.16b, v24.16b
   1444 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   1445 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   1446 
   1447 	aese	v3.16b, v23.16b
   1448 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   1449 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
   1450 
   1451 	add	w12, w12, #1                            //CTR block 4k+8
   1452 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1453 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   1454 
   1455 	aese	v2.16b, v25.16b
   1456 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   1457 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
   1458 
   1459 	aese	v3.16b, v24.16b
   1460 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   1461 
   1462 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   1463 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1464 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   1465 
   1466 	aese	v2.16b, v26.16b
   1467 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   1468 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   1469 
   1470 	aese	v3.16b, v25.16b
   1471 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   1472 	fmov	d0, x10                               //CTR block 4k+8
   1473 
   1474 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1475 	fmov	v0.d[1], x9                               //CTR block 4k+8
   1476 	rev	w9, w12                                 //CTR block 4k+9
   1477 
   1478 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
   1479 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   1480 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1481 
   1482 	aese	v3.16b, v26.16b
   1483 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   1484 	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
   1485 #ifdef __AARCH64EB__
   1486 	rev	x7, x7
   1487 #endif
   1488 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1489 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   1490 	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
   1491 #ifdef __AARCH64EB__
   1492 	rev	x6, x6
   1493 #endif
   1494 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   1495 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   1496 	add	w12, w12, #1                            //CTR block 4k+9
   1497 
   1498 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
   1499 	fmov	d1, x10                               //CTR block 4k+9
   1500 	cmp	x0, x5                   //.LOOP CONTROL
   1501 
   1502 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   1503 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1504 	fmov	v1.d[1], x9                               //CTR block 4k+9
   1505 
   1506 	rev	w9, w12                                 //CTR block 4k+10
   1507 	add	w12, w12, #1                            //CTR block 4k+10
   1508 
   1509 	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
   1510 #ifdef __AARCH64EB__
   1511 	rev	x20, x20
   1512 #endif
   1513 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   1514 
   1515 	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
   1516 #ifdef __AARCH64EB__
   1517 	rev	x19, x19
   1518 #endif
   1519 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   1520 
   1521 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   1522 	b.lt	.L128_dec_main_loop
   1523 
   1524 .L128_dec_prepretail:	//PREPRETAIL
	//Entered by fall-through once the main loop branch (b.lt) is not taken.
	//Work visible in this section:
	//  - GHASH of the four byte-reversed ciphertext blocks held in v4..v7
	//    (v4 first absorbs the running tag from v11), accumulated as
	//    v11 = low, v9 = high, v10 = mid, followed by the MODULO reduction
	//    that folds everything back into v11.
	//  - AES rounds 0..9 on the counter blocks in v0..v3 with round keys v18..v27;
	//    the final round key is applied afterwards as an xor with x13/x14 in
	//    general registers (see the round 10 comments).
	//  - The outputs for blocks 4k+2 and 4k+3 are finished in x21..x24 and stored.
	//Falls through into .L128_dec_tail.
   1525 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   1526 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   1527 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
   1528 
   1529 	aese	v0.16b, v18.16b
   1530 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   1531 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   1532 
   1533 	aese	v1.16b, v18.16b
   1534 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   1535 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   1536 
   1537 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   1538 	fmov	d2, x10                               //CTR block 4k+6
   1539 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   1540 
   1541 	aese	v0.16b, v19.16b
   1542 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   1543 	fmov	v2.d[1], x9                               //CTR block 4k+6
   1544 
   1545 	rev	w9, w12                                 //CTR block 4k+7
   1546 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   1547 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
   1548 
   1549 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   1550 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   1551 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   1552 
   1553 	aese	v1.16b, v19.16b
   1554 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   1555 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   1556 
   1557 	aese	v0.16b, v20.16b
   1558 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   1559 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   1560 
   1561 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   1562 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   1563 	fmov	d3, x10                               //CTR block 4k+7
   1564 
   1565 	aese	v2.16b, v18.16b
   1566 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   1567 	fmov	v3.d[1], x9                               //CTR block 4k+7
   1568 
   1569 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
   1570 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   1571 
   1572 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   1573 
   1574 	aese	v2.16b, v19.16b
   1575 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   1576 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   1577 
   1578 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   1579 
   1580 	aese	v3.16b, v18.16b
   1581 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   1582 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   1583 
   1584 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   1585 
   1586 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   1587 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
   1588 
   1589 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   1590 
   1591 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   1592 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
   1593 
   1594 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
   1595 
   1596 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   1597 
   1598 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   1599 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   1600 
   1601 	aese	v1.16b, v20.16b
   1602 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   1603 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   1604 
   1605 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   1606 
   1607 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
   1608 	movi	v8.8b, #0xc2                                   //MODULO - 0xc2 in each of the low 8 bytes; the shl by 56 below leaves it in the top byte only
   1609 
   1610 	aese	v3.16b, v19.16b
   1611 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   1612 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   1613 
   1614 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
   1615 
   1616 	aese	v2.16b, v20.16b
   1617 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   1618 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
   1619 
   1620 	aese	v3.16b, v20.16b
   1621 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   1622 	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
   1623 #ifdef __AARCH64EB__
   1624 	rev	x23, x23
   1625 #endif
   1626 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   1627 	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
   1628 #ifdef __AARCH64EB__
   1629 	rev	x21, x21
   1630 #endif
   1631 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
   1632 
   1633 	aese	v2.16b, v21.16b
   1634 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   1635 
   1636 	aese	v1.16b, v21.16b
   1637 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   1638 	shl	d8, d8, #56               //mod_constant
   1639 
   1640 	aese	v0.16b, v21.16b
   1641 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   1642 
   1643 	aese	v2.16b, v22.16b
   1644 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   1645 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   1646 
   1647 	aese	v1.16b, v22.16b
   1648 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   1649 
   1650 	aese	v3.16b, v21.16b
   1651 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   1652 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1653 
   1654 	aese	v2.16b, v23.16b
   1655 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   1656 
   1657 	aese	v1.16b, v23.16b
   1658 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   1659 
   1660 	aese	v3.16b, v22.16b
   1661 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   1662 
   1663 	aese	v0.16b, v22.16b
   1664 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   1665 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1666 
   1667 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1668 
   1669 	aese	v1.16b, v24.16b
   1670 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   1671 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1672 
   1673 	aese	v3.16b, v23.16b
   1674 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   1675 
   1676 	aese	v0.16b, v23.16b
   1677 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   1678 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1679 
   1680 	aese	v1.16b, v25.16b
   1681 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   1682 
   1683 	aese	v2.16b, v24.16b
   1684 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   1685 
   1686 	aese	v0.16b, v24.16b
   1687 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   1688 
   1689 	aese	v1.16b, v26.16b
   1690 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   1691 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1692 
   1693 	aese	v3.16b, v24.16b
   1694 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   1695 
   1696 	aese	v0.16b, v25.16b
   1697 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   1698 
   1699 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
   1700 
   1701 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1702 	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
   1703 #ifdef __AARCH64EB__
   1704 	rev	x24, x24
   1705 #endif
   1706 	aese	v2.16b, v25.16b
   1707 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   1708 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1709 
   1710 	aese	v3.16b, v25.16b
   1711 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   1712 
   1713 	aese	v0.16b, v26.16b
   1714 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   1715 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1716 
   1717 	aese	v2.16b, v26.16b
   1718 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   1719 
   1720 	aese	v3.16b, v26.16b
   1721 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   1722 	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
   1723 #ifdef __AARCH64EB__
   1724 	rev	x22, x22
   1725 #endif
   1726 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
   1727 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   1728 
   1729 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
   1730 	add	w12, w12, #1                            //CTR block 4k+7
   1731 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   1732 
   1733 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
   1734 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1735 .L128_dec_tail:	//TAIL
	//Dispatch on the number of input bytes left (x5):
	//  x5 > 48 -> .L128_dec_blocks_more_than_3
	//  x5 > 32 -> .L128_dec_blocks_more_than_2
	//  x5 > 16 -> .L128_dec_blocks_more_than_1
	//  else    -> .L128_dec_blocks_less_than_1
	//The first remaining ciphertext block is always xored with v0 here. The later
	//sections take their keystream from v1, v2 and v3 in turn, so on the shorter
	//paths the keystream blocks are shifted down (v3 = v2, v2 = v1, then v3 = v1)
	//and w12 is decremented once per keystream block that will not be consumed.
	//NOTE(review): reached by fall-through from the PREPRETAIL code; it may also be
	//a branch target from code outside this view - confirm.
   1736 
   1737 	sub	x5, x4, x0   //x5 is repurposed: x4 - x0 = number of bytes left to process (x4 presumably end_input_ptr as in the encrypt kernel setup - confirm)
   1738 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
   1739 
   1740 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
   1741 
   1742 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   1743 
   1744 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   1745 
   1746 	cmp	x5, #48                                 //flags are consumed by the b.gt below; the eor/ext/rev in between do not set flags
   1747 
   1748 	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
   1749 #ifdef __AARCH64EB__
   1750 	rev	x7, x7
   1751 #endif
   1752 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   1753 	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
   1754 #ifdef __AARCH64EB__
   1755 	rev	x6, x6
   1756 #endif
   1757 	b.gt	.L128_dec_blocks_more_than_3
   1758 
   1759 	mov	v3.16b, v2.16b                          //at most 3 blocks left: shift keystream down one slot
   1760 	sub	w12, w12, #1                            //one keystream block will go unused
   1761 	movi	v11.8b, #0                              //clear GHASH low; the shorter paths accumulate with eor (tag is already saved in v8)
   1762 
   1763 	movi	v9.8b, #0                               //clear GHASH high
   1764 	mov	v2.16b, v1.16b
   1765 
   1766 	movi	v10.8b, #0                              //clear GHASH mid
   1767 	cmp	x5, #32
   1768 	b.gt	.L128_dec_blocks_more_than_2
   1769 
   1770 	cmp	x5, #16                                 //flags are consumed by the b.gt below; mov/sub in between do not set flags
   1771 
   1772 	mov	v3.16b, v1.16b                          //at most 2 blocks left: the second block takes keystream from the old v1
   1773 	sub	w12, w12, #1                            //a second keystream block will go unused
   1774 	b.gt	.L128_dec_blocks_more_than_1
   1775 
   1776 	sub	w12, w12, #1                            //single block left: a third keystream block will go unused
   1777 	b	.L128_dec_blocks_less_than_1
   1778 .L128_dec_blocks_more_than_3:	//blocks	left >  3
	//Only entered when more than 48 bytes remain. GHASHes the ciphertext block
	//currently in v5 (final-3), stores its output from x6/x7, then loads the next
	//ciphertext block and xors it with keystream v1.
	//The three partial products overwrite v11 (low), v9 (high) and v10 (mid),
	//which is why this path does not need them cleared beforehand.
	//NOTE(review): v15 and the upper half of v17 presumably hold the H^4 terms, as
	//in the h4 loads of the visible encrypt kernels - confirm against this
	//function's setup code.
	//v22 is used as GHASH scratch from here on; no aese follows .L128_dec_tail, so
	//the round key it held during the AES rounds is no longer needed.
	//Falls through into .L128_dec_blocks_more_than_2.
   1779 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
   1780 	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
   1781 
   1782 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1783 
   1784 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
   1785 	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
   1786 	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
   1787 
   1788 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
   1789 	mov	x7, v0.d[1]                            //AES final-2 block - mov high
   1790 
   1791 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
   1792 	mov	x6, v0.d[0]                            //AES final-2 block - mov low
   1793 
   1794 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
   1795 
   1796 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
   1797 
   1798 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1799 	eor	x7, x7, x14                   //AES final-2 block - round 10 high
   1800 #ifdef __AARCH64EB__
   1801 	rev	x7, x7
   1802 #endif
   1803 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
   1804 	eor	x6, x6, x13                   //AES final-2 block - round 10 low
   1805 #ifdef __AARCH64EB__
   1806 	rev	x6, x6
   1807 #endif
   1808 .L128_dec_blocks_more_than_2:	//blocks	left >  2
	//GHASHes the ciphertext block currently in v5 (final-2), stores its output from
	//x6/x7, then loads the next ciphertext block and xors it with keystream v2.
	//Partial products are computed into scratch v21 (low), v20 (high), v22 (mid)
	//and accumulated with eor into v11, v9, v10. v8 carries the running tag only
	//for the first block processed in the tail and is zero afterwards.
	//NOTE(review): v14 and the low half of v17 presumably hold the H^3 terms -
	//confirm against this function's setup code.
	//Falls through into .L128_dec_blocks_more_than_1.
   1809 
   1810 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
   1811 	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
   1812 
   1813 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1814 
   1815 	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
   1816 	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
   1817 
   1818 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
   1819 
   1820 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
   1821 
   1822 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
   1823 	mov	x6, v0.d[0]                            //AES final-1 block - mov low
   1824 
   1825 	mov	x7, v0.d[1]                            //AES final-1 block - mov high
   1826 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
   1827 
   1828 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1829 
   1830 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
   1831 
   1832 	eor	x6, x6, x13                   //AES final-1 block - round 10 low
   1833 #ifdef __AARCH64EB__
   1834 	rev	x6, x6
   1835 #endif
   1836 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
   1837 
   1838 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
   1839 
   1840 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
   1841 	eor	x7, x7, x14                   //AES final-1 block - round 10 high
   1842 #ifdef __AARCH64EB__
   1843 	rev	x7, x7
   1844 #endif
   1845 .L128_dec_blocks_more_than_1:	//blocks	left >  1
	//GHASHes the ciphertext block currently in v5 (final-1), stores its output from
	//x6/x7, then loads the last ciphertext block and xors it with keystream v3.
	//The mid term is duplicated into the upper lane of v22 so that pmull2 can
	//multiply it by the upper half of v16.
	//NOTE(review): v13 and the upper half of v16 presumably hold the H^2 terms -
	//confirm against this function's setup code.
	//The output of the last block is left in x6/x7 (not stored here) and v5 keeps
	//its ciphertext; both are consumed by .L128_dec_blocks_less_than_1, which this
	//section falls through into.
   1846 
   1847 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
   1848 
   1849 	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
   1850 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1851 
   1852 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
   1853 
   1854 	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
   1855 
   1856 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
   1857 
   1858 	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
   1859 	mov	x6, v0.d[0]                            //AES final block - mov low
   1860 
   1861 	mov	x7, v0.d[1]                            //AES final block - mov high
   1862 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
   1863 
   1864 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
   1865 
   1866 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
   1867 
   1868 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
   1869 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1870 
   1871 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
   1872 
   1873 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
   1874 	eor	x7, x7, x14                   //AES final block - round 10 high
   1875 #ifdef __AARCH64EB__
   1876 	rev	x7, x7
   1877 #endif
   1878 	eor	x6, x6, x13                   //AES final block - round 10 low
   1879 #ifdef __AARCH64EB__
   1880 	rev	x6, x6
   1881 #endif
   1882 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
   1883 .L128_dec_blocks_less_than_1:	//blocks	left <= 1
	//Final, possibly partial, block. On entry v5 holds the last ciphertext block
	//and x6/x7 its output (low/high 64 bits). Steps:
	//  1. Build a 128-bit mask of the valid bits from the bit length in x1
	//     (x9 = low 64b mask, x10 = high 64b mask, also placed in v0).
	//  2. Mask the ciphertext, GHASH it into v11/v9/v10 and run the MODULO
	//     reduction down to v11.
	//  3. Merge the masked output with the 16 bytes already at [x2] so that bytes
	//     past the end of the message are written back unchanged.
	//  4. Store the counter word to [x16, 12] and the tag state to [x3], set the
	//     return value, restore the saved registers and return.
	//x13/x14 are overwritten here; they are no longer needed as round key material
	//because the round 10 xor of the last block has already been applied to x6/x7.
   1884 
   1885 	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff
   1886 	and	x1, x1, #127                    //bit_length %= 128
   1887 
   1888 	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff
   1889 	sub	x1, x1, #128                    //bit_length -= 128
   1890 
   1891 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
   1892 
   1893 	and	x1, x1, #127                    //bit_length %= 128, i.e. x1 = number of unused bits in the last block, in [0,127]
   1894 
   1895 	lsr	x14, x14, x1                     //rk10_h is mask for top 64b of last block (register shifts use the amount modulo 64)
   1896 	cmp	x1, #64
   1897 
   1898 	csel	x10, x14, xzr, lt               //high 64b mask: partial mask when fewer than 64 bits are unused, else zero
   1899 	csel	x9, x13, x14, lt                //low 64b mask: all ones when fewer than 64 bits are unused, else partial mask
   1900 
   1901 	fmov	d0, x9                                   //ctr0b is mask for last block
   1902 
   1903 	mov	v0.d[1], x10
   1904 
   1905 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   1906 
   1907 	rev64	v4.16b, v5.16b                                    //GHASH final block
   1908 
   1909 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1910 
   1911 	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
   1912 
   1913 	and	x7, x7, x10                     //keep only the valid bits of the output high half
   1914 
   1915 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   1916 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   1917 
   1918 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   1919 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   1920 
   1921 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   1922 
   1923 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   1924 	bic	x4, x4, x9           //mask out low existing bytes
   1925 	and	x6, x6, x9                      //keep only the valid bits of the output low half
   1926 
   1927 #ifndef __AARCH64EB__
   1928 	rev	w9, w12                         //byte-swap the 32-bit counter before it is stored below
   1929 #else
   1930 	mov	w9, w12
   1931 #endif
   1932 
   1933 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   1934 	movi	v8.8b, #0xc2                    //MODULO - 0xc2 in each of the low 8 bytes; the shl by 56 below leaves it in the top byte only
   1935 
   1936 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   1937 
   1938 	bic	x5, x5, x10   //mask out high existing bytes
   1939 	shl	d8, d8, #56               //mod_constant
   1940 
   1941 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1942 
   1943 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1944 
   1945 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1946 
   1947 	orr	x6, x6, x4                      //merge output bits with the preserved existing bytes (low half)
   1948 	str	w9, [x16, #12]                          //store the updated counter
   1949 
   1950 	orr	x7, x7, x5                      //merge output bits with the preserved existing bytes (high half)
   1951 	stp	x6, x7, [x2]                    //write the final 16 bytes; x2 is not advanced
   1952 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1953 
   1954 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1955 
   1956 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1957 
   1958 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1959 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1960 
   1961 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1962 
   1963 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1964 	ext	v11.16b, v11.16b, v11.16b, #8   //swap the 64-bit halves and byte-reverse each before storing (the visible encrypt kernels apply ext/rev64 after loading from x3)
   1965 	rev64	v11.16b, v11.16b
   1966 	mov	x0, x15                         //return value comes from x15 (byte_len in the visible encrypt kernel setup; presumably the same here - confirm)
   1967 	st1	{ v11.16b }, [x3]               //write the updated GHASH state back to the buffer at x3
   1968 
   1969 	ldp	x21, x22, [sp, #16]             //restore the registers saved in the 112-byte frame
   1970 	ldp	x23, x24, [sp, #32]
   1971 	ldp	d8, d9, [sp, #48]
   1972 	ldp	d10, d11, [sp, #64]
   1973 	ldp	d12, d13, [sp, #80]
   1974 	ldp	d14, d15, [sp, #96]
   1975 	ldp	x19, x20, [sp], #112            //restore x19/x20 and release the frame
   1976 	ret
   1977 
   1978 .L128_dec_ret:
	//Early-exit path: returns 0 in w0 without touching the stack or restoring any
	//register, so it must only be reached before the frame is set up.
	//NOTE(review): presumably the target of a zero-length check at function entry,
	//like the cbz at the top of the visible encrypt kernels - confirm.
   1979 	mov	w0, #0x0
   1980 	ret
   1981 .size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
   1982 .globl	aes_gcm_enc_192_kernel
   1983 .type	aes_gcm_enc_192_kernel,%function
   1984 .align	4
   1985 aes_gcm_enc_192_kernel:
   1986 	AARCH64_VALID_CALL_TARGET
   1987 	cbz	x1, .L192_enc_ret
   1988 	stp	x19, x20, [sp, #-112]!
   1989 	mov	x16, x4
   1990 	mov	x8, x5
   1991 	stp	x21, x22, [sp, #16]
   1992 	stp	x23, x24, [sp, #32]
   1993 	stp	d8, d9, [sp, #48]
   1994 	stp	d10, d11, [sp, #64]
   1995 	stp	d12, d13, [sp, #80]
   1996 	stp	d14, d15, [sp, #96]
   1997 
   1998 	ldp	x10, x11, [x16]             //ctr96_b64, ctr96_t32
   1999 #ifdef __AARCH64EB__
   2000 	rev	x10, x10
   2001 	rev	x11, x11
   2002 #endif
   2003 	ldp	x13, x14, [x8, #192]                     //load rk12
   2004 #ifdef __AARCH64EB__
   2005 	ror	x13, x13, #32
   2006 	ror	x14, x14, #32
   2007 #endif
   2008 	ld1	{v18.4s}, [x8], #16	                             //load rk0
   2009 
   2010 	ld1	{v19.4s}, [x8], #16	                             //load rk1
   2011 
   2012 	ld1	{v20.4s}, [x8], #16	                             //load rk2
   2013 
   2014 	lsr	x12, x11, #32
   2015 	ld1	{v21.4s}, [x8], #16	                             //load rk3
   2016 	orr	w11, w11, w11
   2017 
   2018 	ld1	{v22.4s}, [x8], #16	                             //load rk4
   2019 	rev	w12, w12                               //rev_ctr32
   2020 
   2021 	add	w12, w12, #1                           //increment rev_ctr32
   2022 	fmov	d3, x10                              //CTR block 3
   2023 
   2024 	rev	w9, w12                                //CTR block 1
   2025 	add	w12, w12, #1                           //CTR block 1
   2026 	fmov	d1, x10                              //CTR block 1
   2027 
   2028 	orr	x9, x11, x9, lsl #32           //CTR block 1
   2029 	ld1	{ v0.16b}, [x16]                            //special case vector load initial counter so we can start first AES block as quickly as possible
   2030 
   2031 	fmov	v1.d[1], x9                              //CTR block 1
   2032 	rev	w9, w12                                //CTR block 2
   2033 	add	w12, w12, #1                           //CTR block 2
   2034 
   2035 	fmov	d2, x10                              //CTR block 2
   2036 	orr	x9, x11, x9, lsl #32           //CTR block 2
   2037 
   2038 	fmov	v2.d[1], x9                              //CTR block 2
   2039 	rev	w9, w12                                //CTR block 3
   2040 
   2041 	orr	x9, x11, x9, lsl #32           //CTR block 3
   2042 	ld1	{v23.4s}, [x8], #16	                             //load rk5
   2043 
   2044 	fmov	v3.d[1], x9                              //CTR block 3
   2045 
   2046 	ld1	{v24.4s}, [x8], #16	                             //load rk6
   2047 
   2048 	ld1	{v25.4s}, [x8], #16	                             //load rk7
   2049 
   2050 	aese	v0.16b, v18.16b
   2051 	aesmc	v0.16b, v0.16b         //AES block 0 - round 0
   2052 	ld1	{ v11.16b}, [x3]
   2053 	ext	v11.16b, v11.16b, v11.16b, #8
   2054 	rev64	v11.16b, v11.16b
   2055 
   2056 	aese	v3.16b, v18.16b
   2057 	aesmc	v3.16b, v3.16b         //AES block 3 - round 0
   2058 	ld1	{v26.4s}, [x8], #16	                             //load rk8
   2059 
   2060 	aese	v1.16b, v18.16b
   2061 	aesmc	v1.16b, v1.16b         //AES block 1 - round 0
   2062 	ldr	q15, [x3, #112]                       //load h4l | h4h
   2063 #ifndef __AARCH64EB__
   2064 	ext	v15.16b, v15.16b, v15.16b, #8
   2065 #endif
   2066 	aese	v2.16b, v18.16b
   2067 	aesmc	v2.16b, v2.16b         //AES block 2 - round 0
   2068 	ld1	{v27.4s}, [x8], #16	                             //load rk9
   2069 
   2070 	aese	v0.16b, v19.16b
   2071 	aesmc	v0.16b, v0.16b         //AES block 0 - round 1
   2072 	ld1	{v28.4s}, [x8], #16	                         //load rk10
   2073 
   2074 	aese	v1.16b, v19.16b
   2075 	aesmc	v1.16b, v1.16b         //AES block 1 - round 1
   2076 	ldr	q12, [x3, #32]                        //load h1l | h1h
   2077 #ifndef __AARCH64EB__
   2078 	ext	v12.16b, v12.16b, v12.16b, #8
   2079 #endif
   2080 	aese	v2.16b, v19.16b
   2081 	aesmc	v2.16b, v2.16b         //AES block 2 - round 1
   2082 	ld1	{v29.4s}, [x8], #16	                         //load rk11
   2083 
   2084 	aese	v3.16b, v19.16b
   2085 	aesmc	v3.16b, v3.16b         //AES block 3 - round 1
   2086 	ldr	q14, [x3, #80]                        //load h3l | h3h
   2087 #ifndef __AARCH64EB__
   2088 	ext	v14.16b, v14.16b, v14.16b, #8
   2089 #endif
   2090 	aese	v0.16b, v20.16b
   2091 	aesmc	v0.16b, v0.16b         //AES block 0 - round 2
   2092 
   2093 	aese	v2.16b, v20.16b
   2094 	aesmc	v2.16b, v2.16b         //AES block 2 - round 2
   2095 
   2096 	aese	v3.16b, v20.16b
   2097 	aesmc	v3.16b, v3.16b         //AES block 3 - round 2
   2098 
   2099 	aese	v0.16b, v21.16b
   2100 	aesmc	v0.16b, v0.16b         //AES block 0 - round 3
   2101 	trn1	v9.2d, v14.2d,    v15.2d                     //h4h | h3h
   2102 
   2103 	aese	v2.16b, v21.16b
   2104 	aesmc	v2.16b, v2.16b         //AES block 2 - round 3
   2105 
   2106 	aese	v1.16b, v20.16b
   2107 	aesmc	v1.16b, v1.16b         //AES block 1 - round 2
   2108 	trn2	v17.2d,  v14.2d,    v15.2d                     //h4l | h3l
   2109 
   2110 	aese	v0.16b, v22.16b
   2111 	aesmc	v0.16b, v0.16b         //AES block 0 - round 4
   2112 
   2113 	aese	v3.16b, v21.16b
   2114 	aesmc	v3.16b, v3.16b         //AES block 3 - round 3
   2115 
   2116 	aese	v1.16b, v21.16b
   2117 	aesmc	v1.16b, v1.16b         //AES block 1 - round 3
   2118 
   2119 	aese	v0.16b, v23.16b
   2120 	aesmc	v0.16b, v0.16b         //AES block 0 - round 5
   2121 
   2122 	aese	v2.16b, v22.16b
   2123 	aesmc	v2.16b, v2.16b         //AES block 2 - round 4
   2124 
   2125 	aese	v1.16b, v22.16b
   2126 	aesmc	v1.16b, v1.16b         //AES block 1 - round 4
   2127 
   2128 	aese	v0.16b, v24.16b
   2129 	aesmc	v0.16b, v0.16b         //AES block 0 - round 6
   2130 
   2131 	aese	v3.16b, v22.16b
   2132 	aesmc	v3.16b, v3.16b         //AES block 3 - round 4
   2133 
   2134 	aese	v2.16b, v23.16b
   2135 	aesmc	v2.16b, v2.16b         //AES block 2 - round 5
   2136 
   2137 	aese	v1.16b, v23.16b
   2138 	aesmc	v1.16b, v1.16b         //AES block 1 - round 5
   2139 
   2140 	aese	v3.16b, v23.16b
   2141 	aesmc	v3.16b, v3.16b         //AES block 3 - round 5
   2142 
   2143 	aese	v2.16b, v24.16b
   2144 	aesmc	v2.16b, v2.16b         //AES block 2 - round 6
   2145 	ldr	q13, [x3, #64]                        //load h2l | h2h
   2146 #ifndef __AARCH64EB__
   2147 	ext	v13.16b, v13.16b, v13.16b, #8
   2148 #endif
   2149 	aese	v1.16b, v24.16b
   2150 	aesmc	v1.16b, v1.16b         //AES block 1 - round 6
   2151 
   2152 	aese	v3.16b, v24.16b
   2153 	aesmc	v3.16b, v3.16b         //AES block 3 - round 6
   2154 
   2155 	aese	v0.16b, v25.16b
   2156 	aesmc	v0.16b, v0.16b         //AES block 0 - round 7
   2157 
   2158 	aese	v1.16b, v25.16b
   2159 	aesmc	v1.16b, v1.16b         //AES block 1 - round 7
   2160 	trn2	v16.2d,  v12.2d,    v13.2d                     //h2l | h1l
   2161 
   2162 	aese	v3.16b, v25.16b
   2163 	aesmc	v3.16b, v3.16b         //AES block 3 - round 7
   2164 
   2165 	aese	v0.16b, v26.16b
   2166 	aesmc	v0.16b, v0.16b         //AES block 0 - round 8
   2167 
   2168 	aese	v2.16b, v25.16b
   2169 	aesmc	v2.16b, v2.16b         //AES block 2 - round 7
   2170 	trn1	v8.2d,    v12.2d,    v13.2d                     //h2h | h1h
   2171 
   2172 	aese	v1.16b, v26.16b
   2173 	aesmc	v1.16b, v1.16b         //AES block 1 - round 8
   2174 
   2175 	aese	v3.16b, v26.16b
   2176 	aesmc	v3.16b, v3.16b         //AES block 3 - round 8
   2177 
   2178 	aese	v2.16b, v26.16b
   2179 	aesmc	v2.16b, v2.16b         //AES block 2 - round 8
   2180 
   2181 	aese	v0.16b, v27.16b
   2182 	aesmc	v0.16b, v0.16b         //AES block 0 - round 9
   2183 
   2184 	aese	v3.16b, v27.16b
   2185 	aesmc	v3.16b, v3.16b         //AES block 3 - round 9
   2186 
   2187 	aese	v2.16b, v27.16b
   2188 	aesmc	v2.16b, v2.16b         //AES block 2 - round 9
   2189 
   2190 	aese	v1.16b, v27.16b
   2191 	aesmc	v1.16b, v1.16b         //AES block 1 - round 9
   2192 
   2193 	aese	v0.16b, v28.16b
   2194 	aesmc	v0.16b, v0.16b         //AES block 0 - round 10
   2195 
   2196 	aese	v2.16b, v28.16b
   2197 	aesmc	v2.16b, v2.16b         //AES block 2 - round 10
   2198 
   2199 	aese	v1.16b, v28.16b
   2200 	aesmc	v1.16b, v1.16b         //AES block 1 - round 10
   2201 	lsr	x5, x1, #3             //byte_len
   2202 	mov	x15, x5
   2203 
   2204 	aese	v3.16b, v28.16b
   2205 	aesmc	v3.16b, v3.16b         //AES block 3 - round 10
   2206 	sub	x5, x5, #1     //byte_len - 1
   2207 
   2208 	eor	v16.16b, v16.16b, v8.16b                    //h2k | h1k
   2209 	and	x5, x5, #0xffffffffffffffc0   //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   2210 
   2211 	eor	v17.16b, v17.16b, v9.16b                 //h4k | h3k
   2212 
   2213 	aese	v2.16b, v29.16b                                    //AES block 2 - round 11
   2214 	add	x4, x0, x1, lsr #3  //end_input_ptr
   2215 	add	x5, x5, x0
   2216 
   2217 	aese	v1.16b, v29.16b                                    //AES block 1 - round 11
   2218 	cmp	x0, x5                  //check if we have <= 4 blocks
   2219 
   2220 	aese	v0.16b, v29.16b                                    //AES block 0 - round 11
   2221 	add	w12, w12, #1                           //CTR block 3
   2222 
   2223 	aese	v3.16b, v29.16b                                    //AES block 3 - round 11
   2224 	b.ge	.L192_enc_tail                                   //handle tail
   2225 
   2226 	rev	w9, w12                                //CTR block 4
   2227 	ldp	x6, x7, [x0, #0]           //AES block 0 - load plaintext
   2228 #ifdef __AARCH64EB__
   2229 	rev	x6, x6
   2230 	rev	x7, x7
   2231 #endif
   2232 	orr	x9, x11, x9, lsl #32           //CTR block 4
   2233 	ldp	x21, x22, [x0, #32]          //AES block 2 - load plaintext
   2234 #ifdef __AARCH64EB__
   2235 	rev	x21, x21
   2236 	rev	x22, x22
   2237 #endif
   2238 	ldp	x23, x24, [x0, #48]          //AES block 3 - load plaintext
   2239 #ifdef __AARCH64EB__
   2240 	rev	x23, x23
   2241 	rev	x24, x24
   2242 #endif
   2243 	ldp	x19, x20, [x0, #16]          //AES block 1 - load plaintext
   2244 #ifdef __AARCH64EB__
   2245 	rev	x19, x19
   2246 	rev	x20, x20
   2247 #endif
   2248 	add	x0, x0, #64                      //AES input_ptr update
   2249 	cmp	x0, x5                  //check if we have <= 8 blocks
   2250 
   2251 	eor	x6, x6, x13                    //AES block 0 - round 12 low
   2252 
   2253 	eor	x7, x7, x14                    //AES block 0 - round 12 high
   2254 	eor	x22, x22, x14                    //AES block 2 - round 12 high
   2255 	fmov	d4, x6                              //AES block 0 - mov low
   2256 
   2257 	eor	x24, x24, x14                    //AES block 3 - round 12 high
   2258 	fmov	v4.d[1], x7                          //AES block 0 - mov high
   2259 
   2260 	eor	x21, x21, x13                    //AES block 2 - round 12 low
   2261 	eor	x19, x19, x13                    //AES block 1 - round 12 low
   2262 
   2263 	fmov	d5, x19                              //AES block 1 - mov low
   2264 	eor	x20, x20, x14                    //AES block 1 - round 12 high
   2265 
   2266 	fmov	v5.d[1], x20                          //AES block 1 - mov high
   2267 
   2268 	eor	x23, x23, x13                    //AES block 3 - round 12 low
   2269 	fmov	d6, x21                              //AES block 2 - mov low
   2270 
   2271 	add	w12, w12, #1                           //CTR block 4
   2272 	eor	v4.16b, v4.16b, v0.16b                         //AES block 0 - result
   2273 	fmov	d0, x10                              //CTR block 4
   2274 
   2275 	fmov	v0.d[1], x9                              //CTR block 4
   2276 	rev	w9, w12                                //CTR block 5
   2277 
   2278 	orr	x9, x11, x9, lsl #32           //CTR block 5
   2279 	add	w12, w12, #1                           //CTR block 5
   2280 
   2281 	fmov	d7, x23                              //AES block 3 - mov low
   2282 	st1	{ v4.16b}, [x2], #16                    //AES block 0 - store result
   2283 
   2284 	fmov	v6.d[1], x22                          //AES block 2 - mov high
   2285 
   2286 	eor	v5.16b, v5.16b, v1.16b                         //AES block 1 - result
   2287 	fmov	d1, x10                              //CTR block 5
   2288 	st1	{ v5.16b}, [x2], #16                    //AES block 1 - store result
   2289 
   2290 	fmov	v7.d[1], x24                          //AES block 3 - mov high
   2291 
   2292 	fmov	v1.d[1], x9                              //CTR block 5
   2293 	rev	w9, w12                                //CTR block 6
   2294 
   2295 	orr	x9, x11, x9, lsl #32           //CTR block 6
   2296 
   2297 	add	w12, w12, #1                           //CTR block 6
   2298 	eor	v6.16b, v6.16b, v2.16b                         //AES block 2 - result
   2299 	fmov	d2, x10                              //CTR block 6
   2300 
   2301 	fmov	v2.d[1], x9                              //CTR block 6
   2302 	rev	w9, w12                                //CTR block 7
   2303 
   2304 	orr	x9, x11, x9, lsl #32           //CTR block 7
   2305 	st1	{ v6.16b}, [x2], #16                    //AES block 2 - store result
   2306 
   2307 	eor	v7.16b, v7.16b, v3.16b                         //AES block 3 - result
   2308 	st1	{ v7.16b}, [x2], #16                    //AES block 3 - store result
   2309 	b.ge	.L192_enc_prepretail                             //do prepretail
   2310 
   2311 .L192_enc_main_loop:	//main loop start: each pass runs AES rounds 0-11 on counter blocks v0-v3, GHASHes the four ciphertext blocks v4-v7 stored by the previous pass, and stores four new ciphertext blocks
   2312 	aese	v2.16b, v18.16b
   2313 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
   2314 	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
   2315 
   2316 	aese	v1.16b, v18.16b
   2317 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
   2318 	ldp	x19, x20, [x0, #16]          //AES block 4k+5 - load plaintext
   2319 #ifdef __AARCH64EB__
   2320 	rev	x19, x19
   2321 	rev	x20, x20
   2322 #endif
   2323 	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0 - swap the 64-bit halves of the running GHASH value
   2324 	fmov	d3, x10                              //CTR block 4k+7 - low half (this is the block encrypted in v3 below)
   2325 	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
   2326 
   2327 	aese	v2.16b, v19.16b
   2328 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
   2329 	fmov	v3.d[1], x9                              //CTR block 4k+7 - high half (x9 was prepared at the end of the previous pass)
   2330 
   2331 	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
   2332 	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2333 	ldp	x21, x22, [x0, #32]          //AES block 4k+6 - load plaintext
   2334 #ifdef __AARCH64EB__
   2335 	rev	x21, x21
   2336 	rev	x22, x22
   2337 #endif
   2338 	aese	v0.16b, v18.16b
   2339 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
   2340 	ldp	x23, x24, [x0, #48]          //AES block 4k+7 - load plaintext
   2341 #ifdef __AARCH64EB__
   2342 	rev	x23, x23
   2343 	rev	x24, x24
   2344 #endif
   2345 	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
   2346 	eor	v4.16b, v4.16b, v11.16b                          //PRE 1 - fold the running GHASH value into block 4k
   2347 
   2348 	aese	v1.16b, v19.16b
   2349 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
   2350 
   2351 	aese	v0.16b, v19.16b
   2352 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
   2353 	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
   2354 
   2355 	aese	v3.16b, v18.16b
   2356 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
   2357 	eor	x24, x24, x14                    //AES block 4k+7 - round 12 high
   2358 
   2359 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
   2360 	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
   2361 
   2362 	aese	v0.16b, v20.16b
   2363 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
   2364 
   2365 	aese	v3.16b, v19.16b
   2366 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
   2367 	eor	x21, x21, x13                    //AES block 4k+6 - round 12 low
   2368 
   2369 	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
   2370 	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
   2371 
   2372 	aese	v0.16b, v21.16b
   2373 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
   2374 	eor	x19, x19, x13                    //AES block 4k+5 - round 12 low
   2375 
   2376 	aese	v1.16b, v20.16b
   2377 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
   2378 	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
   2379 
   2380 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
   2381 	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
   2382 
   2383 	aese	v2.16b, v20.16b
   2384 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
   2385 
   2386 	aese	v1.16b, v21.16b
   2387 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
   2388 
   2389 	mov	d10, v17.d[1]                              //GHASH block 4k - mid
   2390 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
   2391 
   2392 	aese	v3.16b, v20.16b
   2393 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
   2394 	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
   2395 
   2396 	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
   2397 
   2398 	aese	v0.16b, v22.16b
   2399 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
   2400 	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
   2401 
   2402 	aese	v3.16b, v21.16b
   2403 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
   2404 
   2405 	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
   2406 	eor	x20, x20, x14                    //AES block 4k+5 - round 12 high
   2407 	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
   2408 
   2409 	aese	v0.16b, v23.16b
   2410 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
   2411 	add	w12, w12, #1                           //CTR block 4k+7 - advance the counter past the block just placed in v3
   2412 
   2413 	aese	v3.16b, v22.16b
   2414 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
   2415 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
   2416 
   2417 	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
   2418 	eor	x22, x22, x14                    //AES block 4k+6 - round 12 high
   2419 
   2420 	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
   2421 	eor	x23, x23, x13                    //AES block 4k+7 - round 12 low
   2422 	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
   2423 
   2424 	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
   2425 	rev	w9, w12                                //CTR block 4k+8
   2426 
   2427 	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
   2428 	orr	x9, x11, x9, lsl #32           //CTR block 4k+8
   2429 
   2430 	aese	v2.16b, v21.16b
   2431 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
   2432 	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
   2433 
   2434 	aese	v1.16b, v22.16b
   2435 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
   2436 	ldp	x6, x7, [x0, #0]           //AES block 4k+4 - load plaintext
   2437 #ifdef __AARCH64EB__
   2438 	rev	x6, x6
   2439 	rev	x7, x7
   2440 #endif
   2441 	aese	v0.16b, v24.16b
   2442 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
   2443 	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
   2444 
   2445 	aese	v2.16b, v22.16b
   2446 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
   2447 	add	x0, x0, #64                      //AES input_ptr update
   2448 
   2449 	aese	v1.16b, v23.16b
   2450 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
   2451 	movi	v8.8b, #0xc2                                     //mod_constant seed: 0xc2 in each of the low 8 bytes (shifted into place below)
   2452 
   2453 	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
   2454 	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
   2455 	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
   2456 
   2457 	aese	v2.16b, v23.16b
   2458 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
   2459 	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
   2460 
   2461 	aese	v1.16b, v24.16b
   2462 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
   2463 	shl	d8, d8, #56              //mod_constant: d8 = 0xc2 << 56
   2464 
   2465 	aese	v3.16b, v23.16b
   2466 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
   2467 	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
   2468 
   2469 	aese	v0.16b, v25.16b
   2470 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
   2471 	fmov	d5, x19                              //AES block 4k+5 - mov low
   2472 
   2473 	aese	v1.16b, v25.16b
   2474 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
   2475 	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
   2476 
   2477 	aese	v3.16b, v24.16b
   2478 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
   2479 	fmov	v5.d[1], x20                          //AES block 4k+5 - mov high
   2480 
   2481 	aese	v0.16b, v26.16b
   2482 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
   2483 	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
   2484 
   2485 	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
   2486 	cmp	x0, x5                  //LOOP CONTROL - sets the flags tested by the b.lt at the bottom; nothing in between writes flags
   2487 	fmov	d4, x6                              //AES block 4k+4 - mov low
   2488 
   2489 	aese	v2.16b, v24.16b
   2490 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
   2491 	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
   2492 
   2493 	aese	v1.16b, v26.16b
   2494 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
   2495 	fmov	d7, x23                              //AES block 4k+7 - mov low
   2496 
   2497 	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
   2498 	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   2499 	add	w12, w12, #1                           //CTR block 4k+8
   2500 
   2501 	aese	v2.16b, v25.16b
   2502 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
   2503 	fmov	v7.d[1], x24                          //AES block 4k+7 - mov high
   2504 
   2505 	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   2506 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   2507 	fmov	d6, x21                              //AES block 4k+6 - mov low
   2508 
   2509 	aese	v3.16b, v25.16b
   2510 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
   2511 
   2512 	aese	v0.16b, v27.16b
   2513 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
   2514 	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
   2515 
   2516 	aese	v2.16b, v26.16b
   2517 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
   2518 
   2519 	aese	v3.16b, v26.16b
   2520 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
   2521 
   2522 	aese	v1.16b, v27.16b
   2523 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
   2524 
   2525 	aese	v0.16b, v28.16b
   2526 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
   2527 	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
   2528 
   2529 	aese	v3.16b, v27.16b
   2530 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
   2531 
   2532 	aese	v2.16b, v27.16b
   2533 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
   2534 
   2535 	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
   2536 
   2537 	aese	v1.16b, v28.16b
   2538 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
   2539 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   2540 
   2541 	aese	v2.16b, v28.16b
   2542 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
   2543 
   2544 	eor	v4.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
   2545 	fmov	d0, x10                              //CTR block 4k+8
   2546 
   2547 	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
   2548 	fmov	v0.d[1], x9                              //CTR block 4k+8
   2549 	rev	w9, w12                                //CTR block 4k+9
   2550 
   2551 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   2552 	fmov	v6.d[1], x22                          //AES block 4k+6 - mov high
   2553 	st1	{ v4.16b}, [x2], #16                    //AES block 4k+4 - store result
   2554 
   2555 	aese	v3.16b, v28.16b
   2556 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
   2557 	orr	x9, x11, x9, lsl #32           //CTR block 4k+9
   2558 
   2559 	eor	v5.16b, v5.16b, v1.16b                         //AES block 4k+5 - result
   2560 	add	w12, w12, #1                           //CTR block 4k+9
   2561 	fmov	d1, x10                              //CTR block 4k+9
   2562 
   2563 	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
   2564 	fmov	v1.d[1], x9                              //CTR block 4k+9
   2565 	rev	w9, w12                                //CTR block 4k+10
   2566 
   2567 	add	w12, w12, #1                           //CTR block 4k+10
   2568 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   2569 	orr	x9, x11, x9, lsl #32           //CTR block 4k+10
   2570 
   2571 	st1	{ v5.16b}, [x2], #16                    //AES block 4k+5 - store result
   2572 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   2573 
   2574 	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
   2575 	eor	v6.16b, v6.16b, v2.16b                         //AES block 4k+6 - result
   2576 	fmov	d2, x10                              //CTR block 4k+10
   2577 
   2578 	st1	{ v6.16b}, [x2], #16                    //AES block 4k+6 - store result
   2579 	fmov	v2.d[1], x9                              //CTR block 4k+10
   2580 	rev	w9, w12                                //CTR block 4k+11
   2581 
   2582 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
   2583 	orr	x9, x11, x9, lsl #32           //CTR block 4k+11 - left in x9 for the next pass / prepretail to place in v3
   2584 
   2585 	eor	v7.16b, v7.16b, v3.16b                         //AES block 4k+7 - result
   2586 	st1	{ v7.16b}, [x2], #16                    //AES block 4k+7 - store result
   2587 	b.lt	.L192_enc_main_loop                              //repeat while input_ptr (x0) < x5 (signed compare done at LOOP CONTROL)
   2588 
   2589 .L192_enc_prepretail:	//PREPRETAIL: GHASH the four ciphertext blocks already stored (v4-v7), reduce into v11, and run AES rounds 0-11 on counter blocks v0-v3; no input is loaded and no output is stored here
   2590 	aese	v0.16b, v18.16b
   2591 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
   2592 	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
   2593 
   2594 	fmov	d3, x10                              //CTR block 4k+7 - low half (this is the block encrypted in v3 below)
   2595 	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0 - swap the 64-bit halves of the running GHASH value
   2596 	add	w12, w12, #1                           //CTR block 4k+7 - advance the counter past the block placed in v3
   2597 
   2598 	aese	v1.16b, v18.16b
   2599 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
   2600 	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
   2601 
   2602 	aese	v2.16b, v18.16b
   2603 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
   2604 
   2605 	fmov	v3.d[1], x9                              //CTR block 4k+7 - high half
   2606 	eor	v4.16b, v4.16b, v11.16b                          //PRE 1 - fold the running GHASH value into block 4k
   2607 	mov	d10, v17.d[1]                              //GHASH block 4k - mid
   2608 
   2609 	aese	v1.16b, v19.16b
   2610 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
   2611 	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
   2612 
   2613 	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
   2614 
   2615 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
   2616 	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
   2617 
   2618 	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
   2619 	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2620 
   2621 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
   2622 
   2623 	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
   2624 	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
   2625 
   2626 	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
   2627 	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
   2628 
   2629 	aese	v3.16b, v18.16b
   2630 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
   2631 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
   2632 
   2633 	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
   2634 
   2635 	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
   2636 	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
   2637 
   2638 	aese	v3.16b, v19.16b
   2639 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
   2640 
   2641 	aese	v2.16b, v19.16b
   2642 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
   2643 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
   2644 
   2645 	aese	v0.16b, v19.16b
   2646 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
   2647 
   2648 	aese	v1.16b, v20.16b
   2649 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
   2650 	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
   2651 
   2652 	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
   2653 	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
   2654 
   2655 	aese	v0.16b, v20.16b
   2656 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
   2657 
   2658 	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
   2659 	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
   2660 
   2661 	aese	v1.16b, v21.16b
   2662 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
   2663 
   2664 	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
   2665 
   2666 	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
   2667 
   2668 	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
   2669 	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
   2670 
   2671 	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
   2672 
   2673 	aese	v0.16b, v21.16b
   2674 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
   2675 	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
   2676 
   2677 	aese	v3.16b, v20.16b
   2678 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
   2679 
   2680 	aese	v2.16b, v20.16b
   2681 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
   2682 	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
   2683 
   2684 	aese	v0.16b, v22.16b
   2685 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
   2686 
   2687 	aese	v3.16b, v21.16b
   2688 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
   2689 	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
   2690 
   2691 	aese	v2.16b, v21.16b
   2692 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
   2693 
   2694 	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
   2695 	movi	v8.8b, #0xc2                                     //mod_constant seed: 0xc2 in each of the low 8 bytes (shifted into place below)
   2696 
   2697 	aese	v3.16b, v22.16b
   2698 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
   2699 
   2700 	aese	v2.16b, v22.16b
   2701 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
   2702 
   2703 	aese	v1.16b, v22.16b
   2704 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
   2705 	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
   2706 
   2707 	aese	v3.16b, v23.16b
   2708 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
   2709 
   2710 	aese	v2.16b, v23.16b
   2711 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
   2712 
   2713 	aese	v1.16b, v23.16b
   2714 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
   2715 	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
   2716 
   2717 	aese	v0.16b, v23.16b
   2718 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
   2719 
   2720 	aese	v3.16b, v24.16b
   2721 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
   2722 	eor	v10.16b, v10.16b, v9.16b                        //karatsuba tidy up (mid ^= high)
   2723 
   2724 	aese	v1.16b, v24.16b
   2725 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
   2726 
   2727 	aese	v0.16b, v24.16b
   2728 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
   2729 	shl	d8, d8, #56              //mod_constant: d8 = 0xc2 << 56
   2730 
   2731 	aese	v3.16b, v25.16b
   2732 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
   2733 
   2734 	aese	v1.16b, v25.16b
   2735 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
   2736 	eor	v10.16b, v10.16b, v11.16b                        //karatsuba tidy up (mid ^= low)
   2737 
   2738 	aese	v0.16b, v25.16b
   2739 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
   2740 
   2741 	pmull	v30.1q, v9.1d, v8.1d                          //MODULO - top 64b align with mid
   2742 
   2743 	aese	v2.16b, v24.16b
   2744 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
   2745 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   2746 
   2747 	aese	v0.16b, v26.16b
   2748 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
   2749 
   2750 	aese	v1.16b, v26.16b
   2751 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
   2752 	eor	v10.16b, v10.16b, v30.16b                        //MODULO - fold into mid
   2753 
   2754 	aese	v2.16b, v25.16b
   2755 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
   2756 
   2757 	aese	v3.16b, v26.16b
   2758 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
   2759 
   2760 	aese	v0.16b, v27.16b
   2761 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
   2762 
   2763 	aese	v2.16b, v26.16b
   2764 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
   2765 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   2766 
   2767 	aese	v3.16b, v27.16b
   2768 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
   2769 
   2770 	aese	v1.16b, v27.16b
   2771 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
   2772 
   2773 	aese	v2.16b, v27.16b
   2774 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
   2775 
   2776 	pmull	v30.1q, v10.1d, v8.1d                         //MODULO - mid 64b align with low
   2777 
   2778 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   2779 
   2780 	aese	v3.16b, v28.16b
   2781 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
   2782 
   2783 	aese	v0.16b, v28.16b
   2784 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
   2785 
   2786 	aese	v2.16b, v28.16b
   2787 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
   2788 
   2789 	aese	v1.16b, v28.16b
   2790 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
   2791 	eor	v11.16b, v11.16b, v30.16b                        //MODULO - fold into low
   2792 
   2793 	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
   2794 
   2795 	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
   2796 
   2797 	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
   2798 
   2799 	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
   2800 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low; v11 now holds the reduced GHASH value and execution falls through into the tail
   2801 .L192_enc_tail:	//TAIL: form the first remaining block's result from keystream v0, then dispatch on how many bytes are left
   2802 
   2803 	sub	x5, x4, x0  //x5 (main_end_input_ptr until now) is reused: end_input_ptr - input_ptr = number of bytes left to process
   2804 	ldp	x6, x7, [x0], #16          //AES block 4k+4 - load plaintext
   2805 #ifdef __AARCH64EB__
   2806 	rev	x6, x6
   2807 	rev	x7, x7
   2808 #endif
   2809 	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
   2810 	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
   2811 
   2812 	fmov	d4, x6                              //AES block 4k+4 - mov low
   2813 
   2814 	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
   2815 	cmp	x5, #48                                          //more than 48 bytes (3 blocks) left?
   2816 
   2817 	eor	v5.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
   2818 
   2819 	ext	v8.16b, v11.16b, v11.16b, #8                    //prepare final partial tag
   2820 	b.gt	.L192_enc_blocks_more_than_3
   2821 
   2822 	sub	w12, w12, #1                                     //3 blocks or fewer: step the counter back by one (presumably because a prepared counter block goes unused - confirm against the epilogue)
   2823 	movi	v10.8b, #0                                       //zero GHASH mid sum (the final-3 step that would initialise it is skipped)
   2824 
   2825 	mov	v3.16b, v2.16b                                   //v3 <- v2: shift keystream blocks down one register since the final-3 step is skipped (NOTE(review): confirm against the later tail labels)
   2826 	movi	v9.8b, #0                                        //zero GHASH high sum
   2827 	cmp	x5, #32                                          //more than 32 bytes (2 blocks) left?
   2828 
   2829 	mov	v2.16b, v1.16b                                   //v2 <- v1: keystream v1 has not been consumed on this path
   2830 	movi	v11.8b, #0                                       //zero GHASH low sum (the previous value was captured in v8 above)
   2831 	b.gt	.L192_enc_blocks_more_than_2
   2832 
   2833 	sub	w12, w12, #1                                     //2 blocks or fewer: step the counter back by one more
   2834 
   2835 	mov	v3.16b, v1.16b                                   //v3 <- v1: shift the unused keystream block down a second time
   2836 	cmp	x5, #16                                          //more than 16 bytes (1 block) left?
   2837 	b.gt	.L192_enc_blocks_more_than_1
   2838 
   2839 	sub	w12, w12, #1                                     //1 block or less: step the counter back by one more
   2840 	b	.L192_enc_blocks_less_than_1
   2841 .L192_enc_blocks_more_than_3:	//blocks left > 3: store the final-3 ciphertext, start the tail GHASH sums (v11 low, v9 high, v10 mid) from it, and form the final-2 result
   2842 	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
   2843 
   2844 	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
   2845 #ifdef __AARCH64EB__
   2846 	rev	x6, x6
   2847 	rev	x7, x7
   2848 #endif
   2849 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   2850 
   2851 	eor	x6, x6, x13                    //AES final-2 block - round 12 low
   2852 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2853 
   2854 	eor	x7, x7, x14                    //AES final-2 block - round 12 high
   2855 	fmov	d5, x6                                //AES final-2 block - mov low
   2856 
   2857 	fmov	v5.d[1], x7                            //AES final-2 block - mov high
   2858 
   2859 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid (v22, a round-key register in the main loop, is reused as scratch from here on)
   2860 
   2861 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
   2862 
   2863 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
   2864 
   2865 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
   2866 
   2867 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2868 
   2869 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
   2870 
   2871 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
   2872 	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result; falls through to the next label, which stores it
   2873 .L192_enc_blocks_more_than_2:	//blocks	left >  2
        // Tail stage "final-2". Entered by fall-through from the stage above,
        // or from the b.gt after `cmp x5, #32`; on that second path v9, v10
        // and v11 were zeroed and v2 was overwritten with v1 beforehand.
        //   * The block in v5 is stored to [x2] (post-increment).
        //   * Its byte-reversed copy (v4), XORed with v8, is multiplied by
        //     v14 (high -> v20, low -> v21) and by v17.d[0] (mid -> v22);
        //     the products are XOR-accumulated into v9, v11 and v10.
        //   * v8 is cleared so that later stages XOR in zero.
        //   * The next input block is loaded through x6/x7, XORed with
        //     x13/x14 and with v2, and left in v5 for the next stage.
        // Control falls through into .L192_enc_blocks_more_than_1.
   2874 
   2875 	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
   2876 
   2877 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
   2878 	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
   2879 #ifdef __AARCH64EB__
   2880 	rev	x6, x6
   2881 	rev	x7, x7
   2882 #endif
   2883 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2884 
   2885 	eor	x7, x7, x14                    //AES final-1 block - round 12 high
   2886 
   2887 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
   2888 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
   2889 
   2890 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
   2891 	eor	x6, x6, x13                    //AES final-1 block - round 12 low
   2892 
   2893 	fmov	d5, x6                                //AES final-1 block - mov low
   2894 
   2895 	fmov	v5.d[1], x7                            //AES final-1 block - mov high
   2896 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
        // v22 = v4.d[1] ^ v4.d[0]: the Karatsuba middle-term operand.
   2897 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
   2898 
   2899 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
   2900 
   2901 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
   2902 
   2903 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2904 
   2905 	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
   2906 
   2907 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
   2908 .L192_enc_blocks_more_than_1:	//blocks	left >  1
        // Tail stage "final-1". Entered by fall-through from the stage above,
        // or from the b.gt after `cmp x5, #16`; on that second path v3 was
        // overwritten with v1 beforehand (and v9/v10/v11 were zeroed earlier).
        //   * The block in v5 is stored to [x2] (post-increment).
        //   * Its byte-reversed copy (v4), XORed with v8, is multiplied by
        //     v13 (high -> v20, low -> v21) and by v16.d[1] (mid -> v22);
        //     the products are XOR-accumulated into v9, v11 and v10.
        //   * v8 is cleared so that the final stage XORs in zero.
        //   * The last input block is loaded through x6/x7, XORed with
        //     x13/x14 and with v3, and left in v5 for the final stage.
        // Control falls through into .L192_enc_blocks_less_than_1.
   2909 
   2910 	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
   2911 
   2912 	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
   2913 #ifdef __AARCH64EB__
   2914 	rev	x6, x6
   2915 	rev	x7, x7
   2916 #endif
   2917 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
   2918 
   2919 	eor	x6, x6, x13                    //AES final block - round 12 low
   2920 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2921 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2922 
   2923 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
   2924 
        // v22.d[0] = v4.d[1] ^ v4.d[0]: the Karatsuba middle-term operand.
   2925 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
   2926 	eor	x7, x7, x14                    //AES final block - round 12 high
   2927 	fmov	d5, x6                                //AES final block - mov low
   2928 
   2929 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
   2930 	fmov	v5.d[1], x7                            //AES final block - mov high
   2931 
        // Copy the middle-term operand into the upper half so the pmull2
        // below pairs it with the upper half of v16.
   2932 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
   2933 
   2934 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
   2935 
   2936 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
   2937 
   2938 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
   2939 
   2940 	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
   2941 
   2942 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
   2943 
   2944 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
   2945 .L192_enc_blocks_less_than_1:	//blocks	left <= 1
        // Final stage: handles the last output block, which may be partial,
        // then finishes the hash and returns.
        //   1. Build a 128-bit mask in v0 from the bit length in x1 and clear
        //      the bits of v5 that lie beyond the input.
        //   2. Fold the masked block into the accumulators (v11 low, v9 high,
        //      v10 mid) using v12 and v16.d[0].
        //   3. Reduce the accumulators to 128 bits with the 0xc2 << 56
        //      constant and store the result to [x3].
        //   4. Re-insert the bytes previously at [x2] into the masked-off part
        //      of v5 (bif) and store all 16 bytes.
        //   5. Store w9 at [x16, #12], return x15 in x0 and restore the
        //      callee-saved registers.
   2946 
   2947 	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
        // w9 = counter word to write back: byte-swapped w12 on little-endian
        // builds, w12 unchanged on big-endian builds.
   2948 #ifndef __AARCH64EB__
   2949 	rev	w9, w12
   2950 #else
   2951 	mov	w9, w12
   2952 #endif
        // The next four x1 operations compute (128 - (x1 mod 128)) mod 128,
        // i.e. how many bits of the 128-bit block are unused
        // (0 when the block is full).
   2953 	and	x1, x1, #127                   //bit_length %= 128
   2954 
   2955 	sub	x1, x1, #128                   //bit_length -= 128
   2956 	mvn	x14, xzr                                     //rk12_h = 0xffffffffffffffff
   2957 
   2958 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   2959 	mvn	x13, xzr                                     //rk12_l = 0xffffffffffffffff
   2960 
   2961 	and	x1, x1, #127                   //bit_length %= 128
   2962 
        // A register-specified lsr uses the shift amount modulo 64, so
        // x14 = ~0 >> (x1 mod 64).
   2963 	lsr	x14, x14, x1                    //rk12_h is mask for top 64b of last block
   2964 	cmp	x1, #64
   2965 
        // x1 <  64: low half = all ones (x13), high half = shifted mask (x14).
        // x1 >= 64: low half = shifted mask (x14), high half = 0.
   2966 	csel	x6, x13, x14, lt
   2967 	csel	x7, x14, xzr, lt
   2968 
   2969 	fmov	d0, x6                                //ctr0b is mask for last block
   2970 
   2971 	fmov	v0.d[1], x7
   2972 
   2973 	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
   2974 
   2975 	rev64	v4.16b, v5.16b                                   //GHASH final block
   2976 
   2977 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2978 
   2979 	mov	d8, v4.d[1]                                 //GHASH final block - mid
   2980 
   2981 	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
   2982 
   2983 	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
   2984 
   2985 	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
   2986 
   2987 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
   2988 
   2989 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
   2990 
   2991 	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
   2992 
   2993 	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
        // v8 is reused from here on as the reduction constant: each byte of
        // the low 64 bits is set to 0xc2, then shl #56 leaves 0xc2 in the
        // top byte only.
   2994 	movi	v8.8b, #0xc2
   2995 
   2996 	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   2997 
   2998 	shl	d8, d8, #56              //mod_constant
   2999 
        // bif copies a bit from v18 into v5 wherever the mask v0 has a 0 bit,
        // so bytes beyond the input keep the value previously at [x2].
   3000 	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
   3001 
   3002 	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
   3003 
   3004 	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   3005 
   3006 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   3007 
   3008 	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
   3009 
   3010 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   3011 
   3012 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   3013 
   3014 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   3015 
   3016 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   3017 	str	w9, [x16, #12]                         //store the updated counter
   3018 
   3019 	st1	{ v5.16b}, [x2]                         //store all 16B
   3020 
   3021 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
        // Swap the 64-bit halves and byte-reverse each half before writing to
        // [x3]; the decrypt kernel later in this file applies the same
        // ext/rev64 pair right after loading from [x3].
   3022 	ext	v11.16b, v11.16b, v11.16b, #8
   3023 	rev64	v11.16b, v11.16b
        // Return value. x15 is set before this excerpt; in the kernels whose
        // setup is visible in this file it holds the byte length (x1 >> 3).
   3024 	mov	x0, x15
   3025 	st1	{ v11.16b }, [x3]
   3026 
        // Epilogue: reload x19-x24 and d8-d15 from the stack frame and release
        // its 112 bytes with the final post-indexed ldp.
   3027 	ldp	x21, x22, [sp, #16]
   3028 	ldp	x23, x24, [sp, #32]
   3029 	ldp	d8, d9, [sp, #48]
   3030 	ldp	d10, d11, [sp, #64]
   3031 	ldp	d12, d13, [sp, #80]
   3032 	ldp	d14, d15, [sp, #96]
   3033 	ldp	x19, x20, [sp], #112
   3034 	ret
   3035 
   3036 .L192_enc_ret:
        // Early exit: returns 0 without touching the stack (writing w0 also
        // clears the upper 32 bits of x0). The branch that targets this label
        // is outside this excerpt; presumably it mirrors the decrypt kernel's
        // `cbz x1, .L192_dec_ret`, taken before any register is saved.
   3037 	mov	w0, #0x0
   3038 	ret
   3039 .size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
   3040 .globl	aes_gcm_dec_192_kernel
   3041 .type	aes_gcm_dec_192_kernel,%function
   3042 .align	4
   3043 aes_gcm_dec_192_kernel:
   3044 	AARCH64_VALID_CALL_TARGET
   3045 	cbz	x1, .L192_dec_ret
   3046 	stp	x19, x20, [sp, #-112]!
   3047 	mov	x16, x4
   3048 	mov	x8, x5
   3049 	stp	x21, x22, [sp, #16]
   3050 	stp	x23, x24, [sp, #32]
   3051 	stp	d8, d9, [sp, #48]
   3052 	stp	d10, d11, [sp, #64]
   3053 	stp	d12, d13, [sp, #80]
   3054 	stp	d14, d15, [sp, #96]
   3055 
   3056 	add	x4, x0, x1, lsr #3   //end_input_ptr
   3057 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   3058 #ifdef __AARCH64EB__
   3059 	rev	x10, x10
   3060 	rev	x11, x11
   3061 #endif
   3062 	ldp	x13, x14, [x8, #192]                     //load rk12
   3063 #ifdef __AARCH64EB__
   3064 	ror	x13, x13, #32
   3065 	ror	x14, x14, #32
   3066 #endif
   3067 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   3068 
   3069 	ld1	{v18.4s}, [x8], #16                                  //load rk0
   3070 
   3071 	lsr	x5, x1, #3              //byte_len
   3072 	mov	x15, x5
   3073 	ld1	{v19.4s}, [x8], #16                               //load rk1
   3074 
   3075 	lsr	x12, x11, #32
   3076 	orr	w11, w11, w11
   3077 	fmov	d3, x10                               //CTR block 3
   3078 
   3079 	rev	w12, w12                                //rev_ctr32
   3080 	fmov	d1, x10                               //CTR block 1
   3081 
   3082 	add	w12, w12, #1                            //increment rev_ctr32
   3083 	ld1	{v20.4s}, [x8], #16                               //load rk2
   3084 
   3085 	aese	v0.16b, v18.16b
   3086 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   3087 	rev	w9, w12                                 //CTR block 1
   3088 
   3089 	add	w12, w12, #1                            //CTR block 1
   3090 	orr	x9, x11, x9, lsl #32            //CTR block 1
   3091 	ld1	{v21.4s}, [x8], #16                               //load rk3
   3092 
   3093 	fmov	v1.d[1], x9                               //CTR block 1
   3094 	rev	w9, w12                                 //CTR block 2
   3095 	add	w12, w12, #1                            //CTR block 2
   3096 
   3097 	fmov	d2, x10                               //CTR block 2
   3098 	orr	x9, x11, x9, lsl #32            //CTR block 2
   3099 
   3100 	fmov	v2.d[1], x9                               //CTR block 2
   3101 	rev	w9, w12                                 //CTR block 3
   3102 
   3103 	aese	v0.16b, v19.16b
   3104 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   3105 	orr	x9, x11, x9, lsl #32            //CTR block 3
   3106 
   3107 	fmov	v3.d[1], x9                               //CTR block 3
   3108 
   3109 	ld1	{v22.4s}, [x8], #16                               //load rk4
   3110 
   3111 	aese	v0.16b, v20.16b
   3112 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   3113 
   3114 	aese	v2.16b, v18.16b
   3115 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   3116 	ld1	{v23.4s}, [x8], #16                               //load rk5
   3117 
   3118 	aese	v1.16b, v18.16b
   3119 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   3120 	ldr	q15, [x3, #112]                        //load h4l | h4h
   3121 #ifndef __AARCH64EB__
   3122 	ext	v15.16b, v15.16b, v15.16b, #8
   3123 #endif
   3124 	aese	v3.16b, v18.16b
   3125 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   3126 	ldr	q13, [x3, #64]                         //load h2l | h2h
   3127 #ifndef __AARCH64EB__
   3128 	ext	v13.16b, v13.16b, v13.16b, #8
   3129 #endif
   3130 	aese	v2.16b, v19.16b
   3131 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   3132 	ldr	q14, [x3, #80]                         //load h3l | h3h
   3133 #ifndef __AARCH64EB__
   3134 	ext	v14.16b, v14.16b, v14.16b, #8
   3135 #endif
   3136 	aese	v1.16b, v19.16b
   3137 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   3138 
   3139 	aese	v3.16b, v19.16b
   3140 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   3141 	ldr	q12, [x3, #32]                         //load h1l | h1h
   3142 #ifndef __AARCH64EB__
   3143 	ext	v12.16b, v12.16b, v12.16b, #8
   3144 #endif
   3145 	aese	v2.16b, v20.16b
   3146 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   3147 	ld1	{v24.4s}, [x8], #16                               //load rk6
   3148 
   3149 	aese	v0.16b, v21.16b
   3150 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   3151 	ld1	{v25.4s}, [x8], #16                               //load rk7
   3152 
   3153 	aese	v1.16b, v20.16b
   3154 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   3155 	ld1	{v26.4s}, [x8], #16                               //load rk8
   3156 
   3157 	aese	v3.16b, v20.16b
   3158 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   3159 	ld1	{v27.4s}, [x8], #16                               //load rk9
   3160 
   3161 	aese	v2.16b, v21.16b
   3162 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   3163 	ld1	{ v11.16b}, [x3]
   3164 	ext	v11.16b, v11.16b, v11.16b, #8
   3165 	rev64	v11.16b, v11.16b
   3166 
   3167 	aese	v1.16b, v21.16b
   3168 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   3169 	add	w12, w12, #1                            //CTR block 3
   3170 
   3171 	aese	v3.16b, v21.16b
   3172 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   3173 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   3174 
   3175 	aese	v0.16b, v22.16b
   3176 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   3177 	ld1	{v28.4s}, [x8], #16                              //load rk10
   3178 
   3179 	aese	v1.16b, v22.16b
   3180 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   3181 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   3182 
   3183 	aese	v2.16b, v22.16b
   3184 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   3185 
   3186 	aese	v3.16b, v22.16b
   3187 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   3188 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   3189 
   3190 	aese	v0.16b, v23.16b
   3191 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   3192 	ld1	{v29.4s}, [x8], #16                              //load rk11
   3193 
   3194 	aese	v1.16b, v23.16b
   3195 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   3196 
   3197 	aese	v2.16b, v23.16b
   3198 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   3199 
   3200 	aese	v3.16b, v23.16b
   3201 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   3202 
   3203 	aese	v0.16b, v24.16b
   3204 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   3205 
   3206 	aese	v2.16b, v24.16b
   3207 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   3208 
   3209 	aese	v3.16b, v24.16b
   3210 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   3211 
   3212 	aese	v0.16b, v25.16b
   3213 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   3214 
   3215 	aese	v2.16b, v25.16b
   3216 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   3217 
   3218 	aese	v3.16b, v25.16b
   3219 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   3220 
   3221 	aese	v1.16b, v24.16b
   3222 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   3223 
   3224 	aese	v2.16b, v26.16b
   3225 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   3226 
   3227 	aese	v3.16b, v26.16b
   3228 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   3229 
   3230 	aese	v1.16b, v25.16b
   3231 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   3232 
   3233 	aese	v2.16b, v27.16b
   3234 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   3235 
   3236 	aese	v3.16b, v27.16b
   3237 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   3238 
   3239 	aese	v1.16b, v26.16b
   3240 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   3241 	sub	x5, x5, #1      //byte_len - 1
   3242 
   3243 	aese	v0.16b, v26.16b
   3244 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   3245 	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   3246 
   3247 	aese	v3.16b, v28.16b
   3248 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   3249 	add	x5, x5, x0
   3250 
   3251 	aese	v1.16b, v27.16b
   3252 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   3253 	cmp	x0, x5                   //check if we have <= 4 blocks
   3254 
   3255 	aese	v0.16b, v27.16b
   3256 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   3257 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   3258 
   3259 	aese	v3.16b, v29.16b                                     //AES block 3 - round 11
   3260 
   3261 	aese	v2.16b, v28.16b
   3262 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   3263 
   3264 	aese	v1.16b, v28.16b
   3265 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   3266 
   3267 	aese	v0.16b, v28.16b
   3268 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   3269 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   3270 
   3271 	aese	v2.16b, v29.16b                                     //AES block 2 - round 11
   3272 
   3273 	aese	v1.16b, v29.16b                                     //AES block 1 - round 11
   3274 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   3275 
   3276 	aese	v0.16b, v29.16b                                     //AES block 0 - round 11
   3277 	b.ge	.L192_dec_tail                                    //handle tail
   3278 
   3279 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
   3280 
   3281 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   3282 
   3283 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   3284 	rev	w9, w12                                 //CTR block 4
   3285 	ld1	{v6.16b, v7.16b}, [x0], #32               //AES block 2,3 - load ciphertext
   3286 
   3287 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   3288 
   3289 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   3290 
   3291 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   3292 	orr	x9, x11, x9, lsl #32            //CTR block 4
   3293 	add	w12, w12, #1                            //CTR block 4
   3294 
   3295 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   3296 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   3297 
   3298 	fmov	d0, x10                               //CTR block 4
   3299 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   3300 	cmp	x0, x5                   //check if we have <= 8 blocks
   3301 
   3302 	eor	x19, x19, x13                   //AES block 1 - round 12 low
   3303 #ifdef __AARCH64EB__
   3304 	rev	x19, x19
   3305 #endif
   3306 	fmov	v0.d[1], x9                               //CTR block 4
   3307 	rev	w9, w12                                 //CTR block 5
   3308 
   3309 	orr	x9, x11, x9, lsl #32            //CTR block 5
   3310 	fmov	d1, x10                               //CTR block 5
   3311 	eor	x20, x20, x14                   //AES block 1 - round 12 high
   3312 #ifdef __AARCH64EB__
   3313 	rev	x20, x20
   3314 #endif
   3315 	add	w12, w12, #1                            //CTR block 5
   3316 	fmov	v1.d[1], x9                               //CTR block 5
   3317 	eor	x6, x6, x13                   //AES block 0 - round 12 low
   3318 #ifdef __AARCH64EB__
   3319 	rev	x6, x6
   3320 #endif
   3321 	rev	w9, w12                                 //CTR block 6
   3322 	eor	x7, x7, x14                   //AES block 0 - round 12 high
   3323 #ifdef __AARCH64EB__
   3324 	rev	x7, x7
   3325 #endif
   3326 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   3327 	orr	x9, x11, x9, lsl #32            //CTR block 6
   3328 
   3329 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   3330 
   3331 	add	w12, w12, #1                            //CTR block 6
   3332 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   3333 	b.ge	.L192_dec_prepretail                              //do prepretail
   3334 
   3335 .L192_dec_main_loop:	//main	loop start
   3336 	aese	v1.16b, v18.16b
   3337 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   3338 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   3339 
   3340 	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   3341 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   3342 
   3343 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   3344 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   3345 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   3346 
   3347 	aese	v1.16b, v19.16b
   3348 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   3349 	fmov	d2, x10                               //CTR block 4k+6
   3350 
   3351 	aese	v0.16b, v18.16b
   3352 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   3353 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   3354 
   3355 	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   3356 	fmov	v2.d[1], x9                               //CTR block 4k+6
   3357 
   3358 	aese	v1.16b, v20.16b
   3359 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   3360 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   3361 
   3362 	aese	v0.16b, v19.16b
   3363 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   3364 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   3365 
   3366 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   3367 	fmov	d3, x10                               //CTR block 4k+7
   3368 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   3369 
   3370 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   3371 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   3372 	rev	w9, w12                                 //CTR block 4k+7
   3373 
   3374 	aese	v2.16b, v18.16b
   3375 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   3376 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   3377 
   3378 	fmov	v3.d[1], x9                               //CTR block 4k+7
   3379 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   3380 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   3381 
   3382 	aese	v1.16b, v21.16b
   3383 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   3384 
   3385 	aese	v0.16b, v20.16b
   3386 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   3387 	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
   3388 #ifdef __AARCH64EB__
   3389 	rev	x22, x22
   3390 #endif
   3391 	aese	v2.16b, v19.16b
   3392 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   3393 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   3394 
   3395 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   3396 
   3397 	aese	v3.16b, v18.16b
   3398 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   3399 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   3400 
   3401 	aese	v2.16b, v20.16b
   3402 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   3403 
   3404 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   3405 	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
   3406 	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
   3407 #ifdef __AARCH64EB__
   3408 	rev	x21, x21
   3409 #endif
   3410 	aese	v1.16b, v22.16b
   3411 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   3412 
   3413 	aese	v0.16b, v21.16b
   3414 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   3415 
   3416 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   3417 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   3418 
   3419 	aese	v3.16b, v19.16b
   3420 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   3421 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
   3422 
   3423 	aese	v0.16b, v22.16b
   3424 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   3425 
   3426 	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   3427 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   3428 
   3429 	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   3430 
   3431 	aese	v0.16b, v23.16b
   3432 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   3433 
   3434 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
   3435 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   3436 
   3437 	aese	v1.16b, v23.16b
   3438 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   3439 
   3440 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   3441 
   3442 	aese	v3.16b, v20.16b
   3443 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   3444 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   3445 
   3446 	aese	v1.16b, v24.16b
   3447 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   3448 
   3449 	aese	v0.16b, v24.16b
   3450 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   3451 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   3452 
   3453 	aese	v3.16b, v21.16b
   3454 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   3455 
   3456 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   3457 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
   3458 
   3459 	aese	v0.16b, v25.16b
   3460 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   3461 
   3462 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   3463 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   3464 
   3465 	aese	v1.16b, v25.16b
   3466 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   3467 
   3468 	aese	v0.16b, v26.16b
   3469 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   3470 	movi	v8.8b, #0xc2
   3471 
   3472 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   3473 
   3474 	aese	v1.16b, v26.16b
   3475 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   3476 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   3477 
   3478 	aese	v2.16b, v21.16b
   3479 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   3480 
   3481 	aese	v0.16b, v27.16b
   3482 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   3483 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   3484 
   3485 	aese	v3.16b, v22.16b
   3486 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   3487 
   3488 	aese	v2.16b, v22.16b
   3489 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   3490 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   3491 
   3492 	aese	v0.16b, v28.16b
   3493 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   3494 
   3495 	aese	v1.16b, v27.16b
   3496 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   3497 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   3498 
   3499 	aese	v2.16b, v23.16b
   3500 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   3501 
   3502 	aese	v3.16b, v23.16b
   3503 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   3504 	shl	d8, d8, #56               //mod_constant
   3505 
   3506 	aese	v1.16b, v28.16b
   3507 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   3508 
   3509 	aese	v2.16b, v24.16b
   3510 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   3511 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
   3512 
   3513 	aese	v3.16b, v24.16b
   3514 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   3515 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   3516 
   3517 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   3518 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
   3519 	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
   3520 #ifdef __AARCH64EB__
   3521 	rev	x23, x23
   3522 #endif
   3523 	aese	v2.16b, v25.16b
   3524 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   3525 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   3526 
   3527 	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11
   3528 	add	w12, w12, #1                            //CTR block 4k+7
   3529 
   3530 	aese	v3.16b, v25.16b
   3531 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   3532 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   3533 
   3534 	aese	v2.16b, v26.16b
   3535 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   3536 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
   3537 
   3538 	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
   3539 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
   3540 	rev	w9, w12                                 //CTR block 4k+8
   3541 
   3542 	aese	v3.16b, v26.16b
   3543 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   3544 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   3545 
   3546 	aese	v2.16b, v27.16b
   3547 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   3548 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   3549 
   3550 	cmp	x0, x5                   //.LOOP CONTROL
   3551 
   3552 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   3553 	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
   3554 #ifdef __AARCH64EB__
   3555 	rev	x24, x24
   3556 #endif
   3557 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   3558 
   3559 	aese	v2.16b, v28.16b
   3560 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   3561 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   3562 
   3563 	aese	v3.16b, v27.16b
   3564 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   3565 
   3566 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   3567 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   3568 
   3569 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   3570 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   3571 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   3572 
   3573 	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
   3574 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   3575 
   3576 	aese	v3.16b, v28.16b
   3577 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   3578 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   3579 
   3580 	fmov	d0, x10                               //CTR block 4k+8
   3581 	add	w12, w12, #1                            //CTR block 4k+8
   3582 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   3583 
   3584 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   3585 	fmov	v0.d[1], x9                               //CTR block 4k+8
   3586 	rev	w9, w12                                 //CTR block 4k+9
   3587 
   3588 	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
   3589 #ifdef __AARCH64EB__
   3590 	rev	x6, x6
   3591 #endif
   3592 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   3593 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   3594 
   3595 	fmov	d1, x10                               //CTR block 4k+9
   3596 	add	w12, w12, #1                            //CTR block 4k+9
   3597 	eor	x19, x19, x13                   //AES block 4k+5 - round 12 low
   3598 #ifdef __AARCH64EB__
   3599 	rev	x19, x19
   3600 #endif
   3601 	fmov	v1.d[1], x9                               //CTR block 4k+9
   3602 	rev	w9, w12                                 //CTR block 4k+10
   3603 	eor	x20, x20, x14                   //AES block 4k+5 - round 12 high
   3604 #ifdef __AARCH64EB__
   3605 	rev	x20, x20
   3606 #endif
   3607 	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
   3608 #ifdef __AARCH64EB__
   3609 	rev	x7, x7
   3610 #endif
   3611 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   3612 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   3613 
   3614 	add	w12, w12, #1                            //CTR block 4k+10
   3615 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   3616 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   3617 
   3618 	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
   3619 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   3620 	b.lt	.L192_dec_main_loop
   3621 
   3622 .L192_dec_prepretail:	//PREPRETAIL
   3623 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   3624 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   3625 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   3626 
   3627 	aese	v1.16b, v18.16b
   3628 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   3629 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   3630 
   3631 	aese	v0.16b, v18.16b
   3632 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   3633 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   3634 
   3635 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   3636 	fmov	d2, x10                               //CTR block 4k+6
   3637 
   3638 	aese	v1.16b, v19.16b
   3639 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   3640 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   3641 
   3642 	aese	v0.16b, v19.16b
   3643 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   3644 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   3645 
   3646 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   3647 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   3648 	fmov	d3, x10                               //CTR block 4k+7
   3649 
   3650 	aese	v1.16b, v20.16b
   3651 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   3652 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   3653 
   3654 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   3655 	fmov	v2.d[1], x9                               //CTR block 4k+6
   3656 	rev	w9, w12                                 //CTR block 4k+7
   3657 
   3658 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   3659 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   3660 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   3661 
   3662 	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   3663 	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
   3664 #ifdef __AARCH64EB__
   3665 	rev	x24, x24
   3666 #endif
   3667 	fmov	v3.d[1], x9                               //CTR block 4k+7
   3668 
   3669 	aese	v0.16b, v20.16b
   3670 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   3671 	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
   3672 #ifdef __AARCH64EB__
   3673 	rev	x21, x21
   3674 #endif
   3675 	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   3676 	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
   3677 #ifdef __AARCH64EB__
   3678 	rev	x22, x22
   3679 #endif
   3680 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   3681 
   3682 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   3683 	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
   3684 #ifdef __AARCH64EB__
   3685 	rev	x23, x23
   3686 #endif
   3687 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   3688 
   3689 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   3690 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   3691 
   3692 	aese	v3.16b, v18.16b
   3693 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   3694 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
   3695 
   3696 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   3697 	add	w12, w12, #1                            //CTR block 4k+7
   3698 
   3699 	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   3700 	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
   3701 
   3702 	aese	v2.16b, v18.16b
   3703 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   3704 
   3705 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   3706 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   3707 
   3708 	aese	v3.16b, v19.16b
   3709 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   3710 
   3711 	aese	v2.16b, v19.16b
   3712 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   3713 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
   3714 
   3715 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   3716 
   3717 	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   3718 
   3719 	aese	v2.16b, v20.16b
   3720 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   3721 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   3722 
   3723 	aese	v3.16b, v20.16b
   3724 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   3725 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   3726 
   3727 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   3728 
   3729 	aese	v0.16b, v21.16b
   3730 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   3731 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   3732 
   3733 	aese	v1.16b, v21.16b
   3734 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   3735 
   3736 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   3737 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
   3738 
   3739 	aese	v0.16b, v22.16b
   3740 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   3741 
   3742 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   3743 	movi	v8.8b, #0xc2
   3744 
   3745 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   3746 
   3747 	aese	v2.16b, v21.16b
   3748 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   3749 
   3750 	shl	d8, d8, #56               //mod_constant
   3751 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   3752 
   3753 	aese	v0.16b, v23.16b
   3754 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   3755 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   3756 
   3757 	aese	v2.16b, v22.16b
   3758 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   3759 
   3760 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   3761 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   3762 
   3763 	aese	v0.16b, v24.16b
   3764 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   3765 
   3766 	aese	v3.16b, v21.16b
   3767 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   3768 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   3769 
   3770 	aese	v2.16b, v23.16b
   3771 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   3772 
   3773 	aese	v0.16b, v25.16b
   3774 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   3775 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   3776 
   3777 	aese	v3.16b, v22.16b
   3778 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   3779 
   3780 	aese	v2.16b, v24.16b
   3781 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   3782 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   3783 
   3784 	aese	v0.16b, v26.16b
   3785 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   3786 
   3787 	aese	v3.16b, v23.16b
   3788 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   3789 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   3790 
   3791 	aese	v1.16b, v22.16b
   3792 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   3793 
   3794 	aese	v2.16b, v25.16b
   3795 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   3796 
   3797 	aese	v0.16b, v27.16b
   3798 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   3799 
   3800 	aese	v1.16b, v23.16b
   3801 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   3802 
   3803 	aese	v3.16b, v24.16b
   3804 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   3805 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   3806 
   3807 	aese	v0.16b, v28.16b
   3808 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   3809 
   3810 	aese	v1.16b, v24.16b
   3811 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   3812 
   3813 	aese	v3.16b, v25.16b
   3814 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   3815 
   3816 	aese	v2.16b, v26.16b
   3817 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   3818 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   3819 
   3820 	aese	v1.16b, v25.16b
   3821 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   3822 
   3823 	aese	v3.16b, v26.16b
   3824 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   3825 
   3826 	aese	v2.16b, v27.16b
   3827 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   3828 
   3829 	aese	v1.16b, v26.16b
   3830 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   3831 
   3832 	aese	v3.16b, v27.16b
   3833 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   3834 
   3835 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   3836 
   3837 	aese	v1.16b, v27.16b
   3838 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   3839 
   3840 	aese	v2.16b, v28.16b
   3841 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   3842 
   3843 	aese	v3.16b, v28.16b
   3844 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   3845 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   3846 
   3847 	aese	v1.16b, v28.16b
   3848 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   3849 
   3850 	aese	v0.16b, v29.16b
   3851 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   3852 
   3853 	aese	v2.16b, v29.16b
   3854 
   3855 	aese	v1.16b, v29.16b
   3856 
   3857 	aese	v3.16b, v29.16b
   3858 
   3859 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   3860 .L192_dec_tail:	//TAIL
   3861 
   3862 	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
   3863 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
   3864 
   3865 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
   3866 
   3867 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   3868 
   3869 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   3870 
   3871 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   3872 
   3873 	cmp	x5, #48
   3874 
   3875 	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
   3876 #ifdef __AARCH64EB__
   3877 	rev	x7, x7
   3878 #endif
   3879 	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
   3880 #ifdef __AARCH64EB__
   3881 	rev	x6, x6
   3882 #endif
   3883 	b.gt	.L192_dec_blocks_more_than_3
   3884 
   3885 	movi	v11.8b, #0
   3886 	movi	v9.8b, #0
   3887 
   3888 	mov	v3.16b, v2.16b
   3889 	mov	v2.16b, v1.16b
   3890 	sub	w12, w12, #1
   3891 
   3892 	movi	v10.8b, #0
   3893 	cmp	x5, #32
   3894 	b.gt	.L192_dec_blocks_more_than_2
   3895 
   3896 	mov	v3.16b, v1.16b
   3897 	cmp	x5, #16
   3898 	sub	w12, w12, #1
   3899 
   3900 	b.gt	.L192_dec_blocks_more_than_1
   3901 
   3902 	sub	w12, w12, #1
   3903 	b	.L192_dec_blocks_less_than_1
   3904 .L192_dec_blocks_more_than_3:	//blocks	left >  3
   3905 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
   3906 	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
   3907 
   3908 	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
   3909 
   3910 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3911 
   3912 	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
   3913 
   3914 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
   3915 	mov	x6, v0.d[0]                            //AES final-2 block - mov low
   3916 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
   3917 
   3918 	mov	x7, v0.d[1]                            //AES final-2 block - mov high
   3919 
   3920 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
   3921 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
   3922 
   3923 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
   3924 
   3925 	eor	x6, x6, x13                   //AES final-2 block - round 12 low
   3926 #ifdef __AARCH64EB__
   3927 	rev	x6, x6
   3928 #endif
   3929 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   3930 
   3931 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
   3932 	eor	x7, x7, x14                   //AES final-2 block - round 12 high
   3933 #ifdef __AARCH64EB__
   3934 	rev	x7, x7
   3935 #endif
   3936 .L192_dec_blocks_more_than_2:	//blocks	left >  2
   3937 
   3938 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
   3939 	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
   3940 
   3941 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3942 
   3943 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   3944 
   3945 	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
   3946 
   3947 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
   3948 
   3949 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
   3950 
   3951 	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
   3952 
   3953 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
   3954 	mov	x7, v0.d[1]                            //AES final-1 block - mov high
   3955 
   3956 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
   3957 	mov	x6, v0.d[0]                            //AES final-1 block - mov low
   3958 
   3959 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
   3960 
   3961 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
   3962 
   3963 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
   3964 	eor	x7, x7, x14                   //AES final-1 block - round 12 high
   3965 #ifdef __AARCH64EB__
   3966 	rev	x7, x7
   3967 #endif
   3968 	eor	x6, x6, x13                   //AES final-1 block - round 12 low
   3969 #ifdef __AARCH64EB__
   3970 	rev	x6, x6
   3971 #endif
   3972 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
   3973 .L192_dec_blocks_more_than_1:	//blocks	left >  1
   3974 
   3975 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
   3976 
   3977 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3978 	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
   3979 
   3980 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
   3981 
   3982 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
   3983 
   3984 	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
   3985 	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
   3986 
   3987 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
   3988 
   3989 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
   3990 
   3991 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
   3992 	mov	x7, v0.d[1]                            //AES final block - mov high
   3993 
   3994 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
   3995 	mov	x6, v0.d[0]                            //AES final block - mov low
   3996 
   3997 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
   3998 
   3999 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   4000 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
   4001 	eor	x7, x7, x14                   //AES final block - round 12 high
   4002 #ifdef __AARCH64EB__
   4003 	rev	x7, x7
   4004 #endif
   4005 	eor	x6, x6, x13                   //AES final block - round 12 low
   4006 #ifdef __AARCH64EB__
   4007 	rev	x6, x6
   4008 #endif
   4009 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
   4010 .L192_dec_blocks_less_than_1:	//blocks	left <= 1
   4011 
   4012 	mvn	x13, xzr                                      //rk12_l = 0xffffffffffffffff
   4013 	ldp	x4, x5, [x2]  //load existing bytes we need to not overwrite
   4014 	and	x1, x1, #127                    //bit_length %= 128
   4015 
   4016 	sub	x1, x1, #128                    //bit_length -= 128
   4017 
   4018 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
   4019 
   4020 	and	x1, x1, #127                    //bit_length %= 128
   4021 	mvn	x14, xzr                                      //rk12_h = 0xffffffffffffffff
   4022 
   4023 	lsr	x14, x14, x1                     //rk12_h is mask for top 64b of last block
   4024 	cmp	x1, #64
   4025 
   4026 	csel	x9, x13, x14, lt
   4027 	csel	x10, x14, xzr, lt
   4028 
   4029 	fmov	d0, x9                                   //ctr0b is mask for last block
   4030 	and	x6, x6, x9
   4031 	bic	x4, x4, x9           //mask out low existing bytes
   4032 
   4033 	orr	x6, x6, x4
   4034 	mov	v0.d[1], x10
   4035 #ifndef __AARCH64EB__
   4036 	rev	w9, w12
   4037 #else
   4038 	mov	w9, w12
   4039 #endif
   4040 
   4041 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   4042 	str	w9, [x16, #12]                          //store the updated counter
   4043 
   4044 	rev64	v4.16b, v5.16b                                    //GHASH final block
   4045 
   4046 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   4047 	bic	x5, x5, x10 //mask out high existing bytes
   4048 
   4049 	and	x7, x7, x10
   4050 
   4051 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   4052 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   4053 
   4054 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   4055 
   4056 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   4057 
   4058 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   4059 
   4060 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   4061 
   4062 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   4063 
   4064 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   4065 	movi	v8.8b, #0xc2
   4066 
   4067 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   4068 
   4069 	shl	d8, d8, #56               //mod_constant
   4070 
   4071 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   4072 
   4073 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   4074 	orr	x7, x7, x5
   4075 	stp	x6, x7, [x2]
   4076 
   4077 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   4078 
   4079 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   4080 
   4081 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   4082 
   4083 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   4084 
   4085 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   4086 
   4087 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   4088 
   4089 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   4090 	ext	v11.16b, v11.16b, v11.16b, #8
   4091 	rev64	v11.16b, v11.16b
   4092 	mov	x0, x15
   4093 	st1	{ v11.16b }, [x3]
   4094 
   4095 	ldp	x21, x22, [sp, #16]
   4096 	ldp	x23, x24, [sp, #32]
   4097 	ldp	d8, d9, [sp, #48]
   4098 	ldp	d10, d11, [sp, #64]
   4099 	ldp	d12, d13, [sp, #80]
   4100 	ldp	d14, d15, [sp, #96]
   4101 	ldp	x19, x20, [sp], #112
   4102 	ret
   4103 
   4104 .L192_dec_ret:
   4105 	mov	w0, #0x0
   4106 	ret
   4107 .size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
   4108 .globl	aes_gcm_enc_256_kernel
   4109 .type	aes_gcm_enc_256_kernel,%function
   4110 .align	4
   4111 aes_gcm_enc_256_kernel:
   4112 	AARCH64_VALID_CALL_TARGET
   4113 	cbz	x1, .L256_enc_ret
   4114 	stp	x19, x20, [sp, #-112]!
   4115 	mov	x16, x4
   4116 	mov	x8, x5
   4117 	stp	x21, x22, [sp, #16]
   4118 	stp	x23, x24, [sp, #32]
   4119 	stp	d8, d9, [sp, #48]
   4120 	stp	d10, d11, [sp, #64]
   4121 	stp	d12, d13, [sp, #80]
   4122 	stp	d14, d15, [sp, #96]
   4123 
   4124 	add	x4, x0, x1, lsr #3   //end_input_ptr
   4125 	lsr	x5, x1, #3              //byte_len
   4126 	mov	x15, x5
   4127 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   4128 #ifdef __AARCH64EB__
   4129 	rev	x10, x10
   4130 	rev	x11, x11
   4131 #endif
   4132 	ldp	x13, x14, [x8, #224]                     //load rk14
   4133 #ifdef __AARCH64EB__
   4134 	ror	x13, x13, #32
   4135 	ror	x14, x14, #32
   4136 #endif
   4137 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   4138 	sub	x5, x5, #1      //byte_len - 1
   4139 
   4140 	ld1	{v18.4s}, [x8], #16                               //load rk0
   4141 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   4142 
   4143 	ld1	{v19.4s}, [x8], #16                               //load rk1
   4144 	add	x5, x5, x0
   4145 
   4146 	lsr	x12, x11, #32
   4147 	fmov	d2, x10                               //CTR block 2
   4148 	orr	w11, w11, w11
   4149 
   4150 	rev	w12, w12                                //rev_ctr32
   4151 	cmp	x0, x5                   //check if we have <= 4 blocks
   4152 	fmov	d1, x10                               //CTR block 1
   4153 
   4154 	aese	v0.16b, v18.16b
   4155 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   4156 	add	w12, w12, #1                            //increment rev_ctr32
   4157 
   4158 	rev	w9, w12                                 //CTR block 1
   4159 	fmov	d3, x10                               //CTR block 3
   4160 
   4161 	orr	x9, x11, x9, lsl #32            //CTR block 1
   4162 	add	w12, w12, #1                            //CTR block 1
   4163 	ld1	{v20.4s}, [x8], #16                               //load rk2
   4164 
   4165 	fmov	v1.d[1], x9                               //CTR block 1
   4166 	rev	w9, w12                                 //CTR block 2
   4167 	add	w12, w12, #1                            //CTR block 2
   4168 
   4169 	orr	x9, x11, x9, lsl #32            //CTR block 2
   4170 	ld1	{v21.4s}, [x8], #16                               //load rk3
   4171 
   4172 	fmov	v2.d[1], x9                               //CTR block 2
   4173 	rev	w9, w12                                 //CTR block 3
   4174 
   4175 	aese	v0.16b, v19.16b
   4176 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   4177 	orr	x9, x11, x9, lsl #32            //CTR block 3
   4178 
   4179 	fmov	v3.d[1], x9                               //CTR block 3
   4180 
   4181 	aese	v1.16b, v18.16b
   4182 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   4183 	ld1	{v22.4s}, [x8], #16                               //load rk4
   4184 
   4185 	aese	v0.16b, v20.16b
   4186 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   4187 	ld1	{v23.4s}, [x8], #16                               //load rk5
   4188 
   4189 	aese	v2.16b, v18.16b
   4190 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   4191 	ld1	{v24.4s}, [x8], #16                               //load rk6
   4192 
   4193 	aese	v1.16b, v19.16b
   4194 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   4195 	ldr	q14, [x3, #80]                         //load h3l | h3h
   4196 #ifndef __AARCH64EB__
   4197 	ext	v14.16b, v14.16b, v14.16b, #8
   4198 #endif
   4199 	aese	v3.16b, v18.16b
   4200 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   4201 	ld1	{v25.4s}, [x8], #16                               //load rk7
   4202 
   4203 	aese	v2.16b, v19.16b
   4204 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   4205 	ld1	{v26.4s}, [x8], #16                               //load rk8
   4206 
   4207 	aese	v1.16b, v20.16b
   4208 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   4209 	ldr	q13, [x3, #64]                         //load h2l | h2h
   4210 #ifndef __AARCH64EB__
   4211 	ext	v13.16b, v13.16b, v13.16b, #8
   4212 #endif
   4213 	aese	v3.16b, v19.16b
   4214 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   4215 	ld1	{v27.4s}, [x8], #16                               //load rk9
   4216 
   4217 	aese	v2.16b, v20.16b
   4218 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   4219 	ldr	q15, [x3, #112]                        //load h4l | h4h
   4220 #ifndef __AARCH64EB__
   4221 	ext	v15.16b, v15.16b, v15.16b, #8
   4222 #endif
   4223 	aese	v1.16b, v21.16b
   4224 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   4225 	ld1	{v28.4s}, [x8], #16                              //load rk10
   4226 
   4227 	aese	v3.16b, v20.16b
   4228 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   4229 	ld1	{v29.4s}, [x8], #16                              //load rk11
   4230 
   4231 	aese	v2.16b, v21.16b
   4232 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   4233 	add	w12, w12, #1                            //CTR block 3
   4234 
   4235 	aese	v0.16b, v21.16b
   4236 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   4237 
   4238 	aese	v3.16b, v21.16b
   4239 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   4240 	ld1	{ v11.16b}, [x3]
   4241 	ext	v11.16b, v11.16b, v11.16b, #8
   4242 	rev64	v11.16b, v11.16b
   4243 
   4244 	aese	v2.16b, v22.16b
   4245 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   4246 
   4247 	aese	v0.16b, v22.16b
   4248 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   4249 
   4250 	aese	v1.16b, v22.16b
   4251 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   4252 
   4253 	aese	v3.16b, v22.16b
   4254 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   4255 
   4256 	aese	v0.16b, v23.16b
   4257 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   4258 
   4259 	aese	v1.16b, v23.16b
   4260 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   4261 
   4262 	aese	v3.16b, v23.16b
   4263 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   4264 
   4265 	aese	v2.16b, v23.16b
   4266 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   4267 
   4268 	aese	v1.16b, v24.16b
   4269 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   4270 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   4271 
   4272 	aese	v3.16b, v24.16b
   4273 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   4274 	ld1	{v30.4s}, [x8], #16                              //load rk12
   4275 
   4276 	aese	v0.16b, v24.16b
   4277 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   4278 	ldr	q12, [x3, #32]                         //load h1l | h1h
   4279 #ifndef __AARCH64EB__
   4280 	ext	v12.16b, v12.16b, v12.16b, #8
   4281 #endif
   4282 	aese	v2.16b, v24.16b
   4283 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   4284 	ld1	{v31.4s}, [x8], #16                              //load rk13
   4285 
   4286 	aese	v1.16b, v25.16b
   4287 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   4288 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   4289 
   4290 	aese	v0.16b, v25.16b
   4291 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   4292 
   4293 	aese	v2.16b, v25.16b
   4294 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   4295 
   4296 	aese	v3.16b, v25.16b
   4297 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   4298 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   4299 
   4300 	aese	v1.16b, v26.16b
   4301 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   4302 
   4303 	aese	v2.16b, v26.16b
   4304 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   4305 
   4306 	aese	v3.16b, v26.16b
   4307 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   4308 
   4309 	aese	v1.16b, v27.16b
   4310 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   4311 
   4312 	aese	v2.16b, v27.16b
   4313 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   4314 
   4315 	aese	v0.16b, v26.16b
   4316 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   4317 
   4318 	aese	v1.16b, v28.16b
   4319 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   4320 
   4321 	aese	v3.16b, v27.16b
   4322 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   4323 
   4324 	aese	v0.16b, v27.16b
   4325 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   4326 
   4327 	aese	v2.16b, v28.16b
   4328 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   4329 
   4330 	aese	v3.16b, v28.16b
   4331 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   4332 
   4333 	aese	v1.16b, v29.16b
   4334 	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
   4335 
   4336 	aese	v2.16b, v29.16b
   4337 	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
   4338 
   4339 	aese	v0.16b, v28.16b
   4340 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   4341 
   4342 	aese	v1.16b, v30.16b
   4343 	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
   4344 
   4345 	aese	v2.16b, v30.16b
   4346 	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
   4347 
   4348 	aese	v0.16b, v29.16b
   4349 	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
   4350 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   4351 
   4352 	aese	v3.16b, v29.16b
   4353 	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
   4354 
   4355 	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
   4356 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   4357 
   4358 	aese	v0.16b, v30.16b
   4359 	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
   4360 
   4361 	aese	v3.16b, v30.16b
   4362 	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
   4363 
   4364 	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
   4365 
   4366 	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
   4367 
   4368 	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
   4369 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   4370 	b.ge	.L256_enc_tail                                    //handle tail
   4371 
   4372 	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
   4373 #ifdef __AARCH64EB__
   4374 	rev	x19, x19
   4375 	rev	x20, x20
   4376 #endif
   4377 	rev	w9, w12                                 //CTR block 4
   4378 	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
   4379 #ifdef __AARCH64EB__
   4380 	rev	x6, x6
   4381 	rev	x7, x7
   4382 #endif
   4383 	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
   4384 #ifdef __AARCH64EB__
   4385 	rev	x23, x23
   4386 	rev	x24, x24
   4387 #endif
   4388 	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
   4389 #ifdef __AARCH64EB__
   4390 	rev	x21, x21
   4391 	rev	x22, x22
   4392 #endif
   4393 	add	x0, x0, #64                       //AES input_ptr update
   4394 
   4395 	eor	x19, x19, x13                     //AES block 1 - round 14 low
   4396 	eor	x20, x20, x14                     //AES block 1 - round 14 high
   4397 
   4398 	fmov	d5, x19                               //AES block 1 - mov low
   4399 	eor	x6, x6, x13                     //AES block 0 - round 14 low
   4400 
   4401 	eor	x7, x7, x14                     //AES block 0 - round 14 high
   4402 	eor	x24, x24, x14                     //AES block 3 - round 14 high
   4403 	fmov	d4, x6                               //AES block 0 - mov low
   4404 
   4405 	cmp	x0, x5                   //check if we have <= 8 blocks
   4406 	fmov	v4.d[1], x7                           //AES block 0 - mov high
   4407 	eor	x23, x23, x13                     //AES block 3 - round 14 low
   4408 
   4409 	eor	x21, x21, x13                     //AES block 2 - round 14 low
   4410 	fmov	v5.d[1], x20                           //AES block 1 - mov high
   4411 
   4412 	fmov	d6, x21                               //AES block 2 - mov low
   4413 	add	w12, w12, #1                            //CTR block 4
   4414 
   4415 	orr	x9, x11, x9, lsl #32            //CTR block 4
   4416 	fmov	d7, x23                               //AES block 3 - mov low
   4417 	eor	x22, x22, x14                     //AES block 2 - round 14 high
   4418 
   4419 	fmov	v6.d[1], x22                           //AES block 2 - mov high
   4420 
   4421 	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
   4422 	fmov	d0, x10                               //CTR block 4
   4423 
   4424 	fmov	v0.d[1], x9                               //CTR block 4
   4425 	rev	w9, w12                                 //CTR block 5
   4426 	add	w12, w12, #1                            //CTR block 5
   4427 
   4428 	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
   4429 	fmov	d1, x10                               //CTR block 5
   4430 	orr	x9, x11, x9, lsl #32            //CTR block 5
   4431 
   4432 	fmov	v1.d[1], x9                               //CTR block 5
   4433 	rev	w9, w12                                 //CTR block 6
   4434 	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
   4435 
   4436 	fmov	v7.d[1], x24                           //AES block 3 - mov high
   4437 	orr	x9, x11, x9, lsl #32            //CTR block 6
   4438 	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
   4439 
   4440 	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
   4441 
   4442 	add	w12, w12, #1                            //CTR block 6
   4443 	fmov	d2, x10                               //CTR block 6
   4444 
   4445 	fmov	v2.d[1], x9                               //CTR block 6
   4446 	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
   4447 	rev	w9, w12                                 //CTR block 7
   4448 
   4449 	orr	x9, x11, x9, lsl #32            //CTR block 7
   4450 
   4451 	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
   4452 	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
   4453 	b.ge	.L256_enc_prepretail                               //do prepretail
   4454 
   // Main loop. Each pass does two interleaved jobs:
   //   * AES: runs rounds 0-13 (keys v18-v31) on the four counter blocks v0-v3
   //     and XORs the result with the next 64 bytes of plaintext. The plaintext
   //     is loaded into integer registers and XORed with x13/x14 there (the
   //     "round 14 low/high" steps) before being moved into v4-v7.
   //   * GHASH: byte-reverses the four ciphertext blocks left in v4-v7 by the
   //     previous pass (GHASH blocks 4k..4k+3), multiplies them by v15/v14/v13/v12
   //     with the Karatsuba mid terms taken from v17/v16, accumulates into
   //     v9 (high) / v10 (mid) / v11 (low) and reduces back into v11.
   // Counter handling: w12 is the running 32-bit counter; it is byte-reversed
   // into w9, shifted above x11 with orr, and paired with x10 to form a block.
   // x0 (input) advances by 64 and x2 (output) by 4 x 16 per pass; the loop
   // repeats while x0 < x5 (cmp x0, x5 ... b.lt).
   4455 .L256_enc_main_loop:	//main	loop start
   4456 	aese	v0.16b, v18.16b
   4457 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   4458 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
   4459 
   4460 	aese	v1.16b, v18.16b
   4461 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   4462 	fmov	d3, x10                               //CTR block 4k+3
   4463 
   4464 	aese	v2.16b, v18.16b
   4465 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   4466 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   4467 
   4468 	aese	v0.16b, v19.16b
   4469 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   4470 	fmov	v3.d[1], x9                               //CTR block 4k+3
   4471 
   4472 	aese	v1.16b, v19.16b
   4473 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   4474 	ldp	x23, x24, [x0, #48]           //AES block 4k+7 - load plaintext
   4475 #ifdef __AARCH64EB__
   4476 	rev	x23, x23
   4477 	rev	x24, x24
   4478 #endif
   4479 	aese	v2.16b, v19.16b
   4480 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   4481 	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
   4482 #ifdef __AARCH64EB__
   4483 	rev	x21, x21
   4484 	rev	x22, x22
   4485 #endif
   4486 	aese	v0.16b, v20.16b
   4487 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   4488 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   4489 
   4490 	aese	v1.16b, v20.16b
   4491 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   4492 
   4493 	aese	v3.16b, v18.16b
   4494 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   4495 	eor	x23, x23, x13                     //AES block 4k+7 - round 14 low
   4496 
   4497 	aese	v0.16b, v21.16b
   4498 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   4499 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   4500 
   4501 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   4502 	eor	x22, x22, x14                     //AES block 4k+6 - round 14 high
   4503 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   4504 
   4505 	aese	v3.16b, v19.16b
   4506 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   4507 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
   4508 
   4509 	aese	v0.16b, v22.16b
   4510 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   4511 
   4512 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   4513 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   4514 
   4515 	aese	v2.16b, v20.16b
   4516 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   4517 
   4518 	aese	v0.16b, v23.16b
   4519 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   4520 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4521 
   4522 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   4523 
   4524 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   4525 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
   4526 
   4527 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   4528 
   4529 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   4530 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   4531 
   4532 	aese	v1.16b, v21.16b
   4533 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   4534 
   4535 	aese	v3.16b, v20.16b
   4536 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   4537 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   4538 
   4539 	aese	v2.16b, v21.16b
   4540 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   4541 
   4542 	aese	v1.16b, v22.16b
   4543 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   4544 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   4545 
   4546 	aese	v3.16b, v21.16b
   4547 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   4548 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   4549 
   4550 	aese	v2.16b, v22.16b
   4551 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   4552 
   4553 	aese	v0.16b, v24.16b
   4554 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   4555 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   4556 
   4557 	aese	v3.16b, v22.16b
   4558 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   4559 
   4560 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   4561 
   4562 	aese	v0.16b, v25.16b
   4563 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   4564 
   4565 	aese	v3.16b, v23.16b
   4566 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   4567 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   4568 
   4569 	aese	v1.16b, v23.16b
   4570 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   4571 
   4572 	aese	v0.16b, v26.16b
   4573 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   4574 
   4575 	aese	v2.16b, v23.16b
   4576 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   4577 
   4578 	aese	v1.16b, v24.16b
   4579 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   4580 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   4581 
   4582 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   4583 
   4584 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   4585 
   4586 	aese	v1.16b, v25.16b
   4587 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   4588 
   4589 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   4590 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   4591 
   4592 	aese	v3.16b, v24.16b
   4593 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   4594 	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
   4595 #ifdef __AARCH64EB__
   4596 	rev	x19, x19
   4597 	rev	x20, x20
   4598 #endif
   4599 	aese	v1.16b, v26.16b
   4600 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   4601 	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
   4602 
   4603 	aese	v2.16b, v24.16b
   4604 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   4605 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   4606 
   4607 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   4608 
   4609 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   4610 	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
   4611 
   4612 	aese	v2.16b, v25.16b
   4613 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   4614 	eor	x19, x19, x13                     //AES block 4k+5 - round 14 low
   4615 
   4616 	aese	v1.16b, v27.16b
   4617 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   4618 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   4619 
   4620 	aese	v3.16b, v25.16b
   4621 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   4622 	eor	x21, x21, x13                     //AES block 4k+6 - round 14 low
   4623 
   4624 	aese	v0.16b, v27.16b
   4625 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   4626 	movi	v8.8b, #0xc2                                    //0xc2 in each of the low 8 bytes; the shl #56 below leaves 0xc2 << 56
   4627 
   4628 	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
   4629 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   4630 	fmov	d5, x19                               //AES block 4k+5 - mov low
   4631 
   4632 	aese	v2.16b, v26.16b
   4633 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   4634 	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
   4635 #ifdef __AARCH64EB__
   4636 	rev	x6, x6
   4637 	rev	x7, x7
   4638 #endif
   4639 	aese	v0.16b, v28.16b
   4640 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   4641 	shl	d8, d8, #56               //mod_constant
   4642 
   4643 	aese	v3.16b, v26.16b
   4644 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   4645 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   4646 
   4647 	aese	v2.16b, v27.16b
   4648 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   4649 
   4650 	aese	v1.16b, v28.16b
   4651 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   4652 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
   4653 
   4654 	aese	v3.16b, v27.16b
   4655 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   4656 	add	w12, w12, #1                            //CTR block 4k+3
   4657 
   4658 	aese	v0.16b, v29.16b
   4659 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   4660 	eor	v4.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   4661 
   4662 	aese	v1.16b, v29.16b
   4663 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   4664 	add	x0, x0, #64                       //AES input_ptr update
   4665 
   4666 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   4667 	rev	w9, w12                                 //CTR block 4k+8
   4668 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   4669 
   4670 	aese	v2.16b, v28.16b
   4671 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   4672 	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
   4673 
   4674 	aese	v1.16b, v30.16b
   4675 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   4676 	eor	v10.16b, v10.16b, v4.16b                         //MODULO - karatsuba tidy up
   4677 
   4678 	aese	v3.16b, v28.16b
   4679 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   4680 	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
   4681 
   4682 	fmov	d4, x6                               //AES block 4k+4 - mov low
   4683 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   4684 	eor	v7.16b, v9.16b, v7.16b                   //MODULO - fold into mid
   4685 
   4686 	aese	v0.16b, v30.16b
   4687 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   4688 	eor	x20, x20, x14                     //AES block 4k+5 - round 14 high
   4689 
   4690 	aese	v2.16b, v29.16b
   4691 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   4692 	eor	x24, x24, x14                     //AES block 4k+7 - round 14 high
   4693 
   4694 	aese	v3.16b, v29.16b
   4695 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   4696 	add	w12, w12, #1                            //CTR block 4k+8
   4697 
   4698 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   4699 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
   4700 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   4701 
   4702 	aese	v2.16b, v30.16b
   4703 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   4704 	fmov	d7, x23                               //AES block 4k+7 - mov low
   4705 
   4706 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   4707 	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
   4708 
   4709 	fmov	d6, x21                               //AES block 4k+6 - mov low
   4710 	cmp	x0, x5                   //.LOOP CONTROL
   4711 
   4712 	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
   4713 
   4714 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
   4715 	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
   4716 	fmov	d0, x10                               //CTR block 4k+8
   4717 
   4718 	fmov	v0.d[1], x9                               //CTR block 4k+8
   4719 	rev	w9, w12                                 //CTR block 4k+9
   4720 	add	w12, w12, #1                            //CTR block 4k+9
   4721 
   4722 	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
   4723 	fmov	d1, x10                               //CTR block 4k+9
   4724 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   4725 
   4726 	aese	v3.16b, v30.16b
   4727 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   4728 	fmov	v1.d[1], x9                               //CTR block 4k+9
   4729 
   4730 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   4731 	rev	w9, w12                                 //CTR block 4k+10
   4732 	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
   4733 
   4734 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   4735 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
   4736 	fmov	v7.d[1], x24                           //AES block 4k+7 - mov high
   4737 
   4738 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   4739 	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
   4740 	add	w12, w12, #1                            //CTR block 4k+10
   4741 
   4742 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   4743 	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
   4744 	fmov	d2, x10                               //CTR block 4k+10
   4745 
   4746 	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
   4747 	fmov	v2.d[1], x9                               //CTR block 4k+10
   4748 	rev	w9, w12                                 //CTR block 4k+11
   4749 
   4750 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   4751 	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
   4752 
   4753 	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+7 - result
   4754 	st1	{ v7.16b}, [x2], #16                     //AES block 4k+7 - store result
   4755 	b.lt	.L256_enc_main_loop
   4756 
   // Prepretail. Same AES/GHASH interleave as the main loop, but nothing is
   // loaded from x0 or stored to x2 here:
   //   * GHASH: folds the four ciphertext blocks still held in v4-v7 into the
   //     accumulator, leaving the reduced value in v11.
   //   * AES: runs rounds 0-13 (keys v18-v31) on the counter blocks v0-v3, so
   //     the keystream is left in v0-v3 when control falls through to
   //     .L256_enc_tail.
   // v3 is built from x10/x9 and w12 is advanced once for it.
   4757 .L256_enc_prepretail:	//PREPRETAIL
   4758 	aese	v1.16b, v18.16b
   4759 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   4760 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
   4761 
   4762 	aese	v2.16b, v18.16b
   4763 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   4764 	fmov	d3, x10                               //CTR block 4k+3
   4765 
   4766 	aese	v0.16b, v18.16b
   4767 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   4768 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
   4769 
   4770 	fmov	v3.d[1], x9                               //CTR block 4k+3
   4771 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   4772 
   4773 	aese	v2.16b, v19.16b
   4774 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   4775 
   4776 	aese	v0.16b, v19.16b
   4777 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   4778 
   4779 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   4780 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
   4781 
   4782 	aese	v2.16b, v20.16b
   4783 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   4784 
   4785 	aese	v3.16b, v18.16b
   4786 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   4787 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   4788 
   4789 	aese	v1.16b, v19.16b
   4790 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   4791 
   4792 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   4793 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   4794 
   4795 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   4796 
   4797 	aese	v2.16b, v21.16b
   4798 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   4799 
   4800 	aese	v1.16b, v20.16b
   4801 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   4802 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   4803 
   4804 	aese	v0.16b, v20.16b
   4805 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   4806 
   4807 	aese	v3.16b, v19.16b
   4808 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   4809 
   4810 	aese	v1.16b, v21.16b
   4811 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   4812 
   4813 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   4814 
   4815 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   4816 
   4817 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   4818 
   4819 	aese	v3.16b, v20.16b
   4820 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   4821 
   4822 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   4823 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   4824 
   4825 	aese	v0.16b, v21.16b
   4826 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   4827 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   4828 
   4829 	aese	v3.16b, v21.16b
   4830 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   4831 
   4832 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   4833 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   4834 
   4835 	aese	v0.16b, v22.16b
   4836 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   4837 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4838 
   4839 	aese	v3.16b, v22.16b
   4840 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   4841 
   4842 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   4843 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   4844 	add	w12, w12, #1                            //CTR block 4k+3
   4845 
   4846 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   4847 
   4848 	aese	v3.16b, v23.16b
   4849 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   4850 
   4851 	aese	v2.16b, v22.16b
   4852 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   4853 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   4854 
   4855 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   4856 
   4857 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   4858 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   4859 
   4860 	aese	v2.16b, v23.16b
   4861 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   4862 
   4863 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   4864 	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
   4865 
   4866 	aese	v1.16b, v22.16b
   4867 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   4868 
   4869 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   4870 
   4871 	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
   4872 
   4873 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   4874 
   4875 	aese	v1.16b, v23.16b
   4876 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   4877 
   4878 	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
   4879 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   4880 
   4881 	aese	v0.16b, v23.16b
   4882 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   4883 
   4884 	aese	v1.16b, v24.16b
   4885 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   4886 
   4887 	aese	v2.16b, v24.16b
   4888 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   4889 
   4890 	aese	v0.16b, v24.16b
   4891 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   4892 	movi	v8.8b, #0xc2                                    //0xc2 in each of the low 8 bytes; the shl #56 below leaves 0xc2 << 56
   4893 
   4894 	aese	v3.16b, v24.16b
   4895 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   4896 
   4897 	aese	v1.16b, v25.16b
   4898 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   4899 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   4900 
   4901 	aese	v0.16b, v25.16b
   4902 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   4903 
   4904 	aese	v3.16b, v25.16b
   4905 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   4906 	shl	d8, d8, #56               //mod_constant
   4907 
   4908 	aese	v1.16b, v26.16b
   4909 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   4910 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
   4911 
   4912 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   4913 
   4914 	aese	v3.16b, v26.16b
   4915 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   4916 
   4917 	aese	v1.16b, v27.16b
   4918 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   4919 
   4920 	aese	v0.16b, v26.16b
   4921 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   4922 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   4923 
   4924 	aese	v3.16b, v27.16b
   4925 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   4926 
   4927 	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
   4928 
   4929 	pmull	v4.1q, v9.1d, v8.1d                         //MODULO - top 64b align with mid
   4930 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   4931 
   4932 	aese	v3.16b, v28.16b
   4933 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   4934 
   4935 	aese	v2.16b, v25.16b
   4936 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   4937 	eor	v10.16b, v10.16b, v11.16b                        //karatsuba tidy up (low half)
   4938 
   4939 	aese	v1.16b, v28.16b
   4940 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   4941 
   4942 	aese	v0.16b, v27.16b
   4943 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   4944 
   4945 	aese	v2.16b, v26.16b
   4946 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   4947 
   4948 	aese	v1.16b, v29.16b
   4949 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   4950 	eor	v10.16b, v10.16b, v4.16b                         //MODULO - fold into mid
   4951 
   4952 	aese	v0.16b, v28.16b
   4953 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   4954 
   4955 	aese	v2.16b, v27.16b
   4956 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   4957 
   4958 	aese	v1.16b, v30.16b
   4959 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   4960 
   4961 	aese	v0.16b, v29.16b
   4962 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   4963 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid (rotated high)
   4964 
   4965 	aese	v3.16b, v29.16b
   4966 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   4967 
   4968 	aese	v2.16b, v28.16b
   4969 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   4970 
   4971 	aese	v0.16b, v30.16b
   4972 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   4973 
   4974 	pmull	v4.1q, v10.1d, v8.1d                        //MODULO - mid 64b align with low
   4975 
   4976 	aese	v2.16b, v29.16b
   4977 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   4978 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   4979 
   4980 	aese	v3.16b, v30.16b
   4981 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   4982 
   4983 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   4984 	eor	v11.16b, v11.16b, v4.16b                         //MODULO - fold into low
   4985 
   4986 	aese	v2.16b, v30.16b
   4987 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   4988 
   4989 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   4990 
   4991 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   4992 
   4993 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   4994 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   // Tail entry. On arrival v0-v3 hold keystream after round 13 and v11 holds
   // the running GHASH value. This section:
   //   * saves the half-swapped v11 in v8 as the partial tag to feed into the
   //     next hashed block;
   //   * sets x5 = x4 - x0, the number of input bytes still to process;
   //   * encrypts the first remaining block (plaintext ^ x13:x14 ^ v0) into v5;
   //   * dispatches on x5: > 48, > 32, > 16 bytes, or the final-block case.
   // On every path that skips .L256_enc_blocks_more_than_3 the accumulators
   // v9/v10/v11 are zeroed, because that section is where they would otherwise
   // be (re)initialised by pmull.
   // NOTE(review): the keystream moves (v3 <- v2, v2 <- v1, v3 <- v1) and the
   // sub w12 steps presumably line up the keystream and counter for the later
   // tail stages, which lie outside this section - confirm against them.
   4995 .L256_enc_tail:	//TAIL
   4996 
   4997 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   4998 	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
   4999 	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
   5000 #ifdef __AARCH64EB__
   5001 	rev	x6, x6
   5002 	rev	x7, x7
   5003 #endif
   5004 	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
   5005 	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
   5006 
   5007 	cmp	x5, #48                             //more than 48 bytes (3 blocks) left?
   5008 	fmov	d4, x6                               //AES block 4k+4 - mov low
   5009 
   5010 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
   5011 
   5012 	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
   5013 	b.gt	.L256_enc_blocks_more_than_3
   5014 
   5015 	cmp	x5, #32                             //more than 32 bytes (2 blocks) left? flags consumed by b.gt below
   5016 	mov	v3.16b, v2.16b                      //move keystream down one register
   5017 	movi	v11.8b, #0                          //clear GHASH low accumulator
   5018 
   5019 	movi	v9.8b, #0                           //clear GHASH high accumulator
   5020 	sub	w12, w12, #1                        //step the counter back by one
   5021 
   5022 	mov	v2.16b, v1.16b                      //move keystream down one register
   5023 	movi	v10.8b, #0                          //clear GHASH mid accumulator
   5024 	b.gt	.L256_enc_blocks_more_than_2
   5025 
   5026 	mov	v3.16b, v1.16b                      //move keystream down again
   5027 	sub	w12, w12, #1                        //step the counter back by one
   5028 	cmp	x5, #16                             //more than 16 bytes (1 block) left?
   5029 
   5030 	b.gt	.L256_enc_blocks_more_than_1
   5031 
   5032 	sub	w12, w12, #1                        //step the counter back by one
   5033 	b	.L256_enc_blocks_less_than_1
   // More than three blocks remain. Stores the ciphertext in v5 (final-3 block)
   // and starts its GHASH: the block is byte-reversed, XORed with the carried
   // partial tag v8, and multiplied by v15 (low/high) and v17.d[1] (mid). The
   // pmull results overwrite v11/v9/v10, so this section initialises the tail
   // accumulators. v8 is then zeroed so the tag is fed in only once.
   // In parallel the next plaintext block is loaded, XORed with x13:x14 and
   // with keystream v1, and left in v5 for the following section, which is
   // reached by fall-through.
   // v22 is used as scratch here; the main loop used it as the round-4 key.
   5034 .L256_enc_blocks_more_than_3:	//blocks	left >  3
   5035 	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
   5036 
   5037 	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
   5038 #ifdef __AARCH64EB__
   5039 	rev	x6, x6
   5040 	rev	x7, x7
   5041 #endif
   5042 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   5043 
   5044 	eor	x6, x6, x13                    //AES final-2 block - round 14 low
   5045 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   5046 
   5047 	eor	x7, x7, x14                    //AES final-2 block - round 14 high
   5048 
   5049 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
   5050 	fmov	d5, x6                                //AES final-2 block - mov low
   5051 
   5052 	fmov	v5.d[1], x7                            //AES final-2 block - mov high
   5053 
   5054 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
   5055 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5056 
   5057 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
   5058 
   5059 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
   5060 
   5061 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
   5062 
   5063 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
   5064 	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
   5065 .L256_enc_blocks_more_than_2:	//blocks left > 2: store the final-2 result, GHASH it against v14, and form the final-1 result with keystream v2
   5066 
   5067 	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result (x2 += 16)
   5068 
   5069 	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
   5070 #ifdef __AARCH64EB__
   5071 	rev	x6, x6                          //__AARCH64EB__ build only: byte-reverse the loaded low word
   5072 	rev	x7, x7                          //__AARCH64EB__ build only: byte-reverse the loaded high word
   5073 #endif
   5074 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block (byte-reverse each 64-bit half of the result just stored)
   5075 
   5076 	eor	x6, x6, x13                    //AES final-1 block - round 14 low
   5077 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero when falling through from the block above)
   5078 
   5079 	fmov	d5, x6                                //AES final-1 block - mov low
   5080 	eor	x7, x7, x14                    //AES final-1 block - round 14 high
   5081 
   5082 	fmov	v5.d[1], x7                            //AES final-1 block - mov high
   5083 
   5084 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5085 
   5086 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high (product into v20)
   5087 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid (d22 = upper half of v4)
   5088 
   5089 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low (product into v21)
   5090 
   5091 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid (d22 = upper half ^ lower half of v4)
   5092 
   5093 	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
   5094 
   5095 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high (accumulate into v9)
   5096 
   5097 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid (multiply by lower half of v17)
   5098 
   5099 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low (accumulate into v11)
   5100 
   5101 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid (accumulate into v10; falls through to the next label)
   5102 .L256_enc_blocks_more_than_1:	//blocks left > 1: store the final-1 result, GHASH it against v13, and form the final block result with keystream v3
   5103 
   5104 	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result (x2 += 16)
   5105 
   5106 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block (byte-reverse each 64-bit half of the result just stored)
   5107 
   5108 	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
   5109 #ifdef __AARCH64EB__
   5110 	rev	x6, x6                          //__AARCH64EB__ build only: byte-reverse the loaded low word
   5111 	rev	x7, x7                          //__AARCH64EB__ build only: byte-reverse the loaded high word
   5112 #endif
   5113 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero when falling through from the block above)
   5114 
   5115 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5116 
   5117 	eor	x6, x6, x13                    //AES final block - round 14 low
   5118 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid (d22 = upper half of v4)
   5119 
   5120 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high (product into v20)
   5121 	eor	x7, x7, x14                    //AES final block - round 14 high
   5122 
   5123 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid (d22 = upper half ^ lower half of v4)
   5124 
   5125 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high (accumulate into v9)
   5126 
   5127 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid (copy to the upper half so pmull2 can pair it with the upper half of v16)
   5128 	fmov	d5, x6                                //AES final block - mov low
   5129 
   5130 	fmov	v5.d[1], x7                            //AES final block - mov high
   5131 
   5132 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid (multiply by upper half of v16)
   5133 
   5134 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low (product into v21)
   5135 
   5136 	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
   5137 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid (accumulate into v10)
   5138 
   5139 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low (accumulate into v11; falls through to the next label)
   5140 .L256_enc_blocks_less_than_1:	//blocks left <= 1: mask the possibly partial last block, GHASH it, reduce, write back counter, output and tag, then return
   5141 
   5142 	and	x1, x1, #127                   //bit_length %= 128
   5143 
   5144 	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff (x13 is reused as a mask from here on)
   5145 	sub	x1, x1, #128                   //bit_length -= 128
   5146 
   5147 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   5148 	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
   5149 
   5150 	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff (x14 is reused as a mask from here on)
   5151 	and	x1, x1, #127                   //bit_length %= 128 (now in [0,127]: number of unused bits in the last block, 0 for a full block)
   5152 
   5153 	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block (a register shift uses x1 modulo 64)
   5154 	cmp	x1, #64                         //are fewer than 64 bits unused?
   5155 
   5156 	csel	x6, x13, x14, lt                //low 64b of mask: all ones if fewer than 64 bits are unused, otherwise the shifted mask
   5157 	csel	x7, x14, xzr, lt                //high 64b of mask: the shifted mask if fewer than 64 bits are unused, otherwise zero
   5158 
   5159 	fmov	d0, x6                                //ctr0b is mask for last block
   5160 
   5161 	fmov	v0.d[1], x7                     //upper half of the mask
   5162 
   5163 	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
   5164 
   5165 	rev64	v4.16b, v5.16b                                   //GHASH final block (uses the masked block, before the existing bytes are merged back in)
   5166 
   5167 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is zero if an earlier tail block already consumed it)
   5168 
   5169 	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
   5170 
   5171 	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
   5172 	mov	d8, v4.d[1]                                 //GHASH final block - mid
   5173 #ifndef __AARCH64EB__
   5174 	rev	w9, w12                         //build without __AARCH64EB__: byte-reverse counter w12 before it is stored below
   5175 #else
   5176 	mov	w9, w12                         //__AARCH64EB__ build: counter w12 is stored as is
   5177 #endif
   5178 
   5179 	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
   5180 
   5181 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
   5182 	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
   5183 
   5184 	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
   5185 
   5186 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
   5187 
   5188 	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
   5189 	movi	v8.8b, #0xc2                    //0xc2 in every byte of d8 (upper half of v8 cleared)
   5190 
   5191 	eor	v4.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   5192 
   5193 	shl	d8, d8, #56              //mod_constant: d8 = 0xc200000000000000
   5194 
   5195 	eor	v10.16b, v10.16b, v4.16b                        //MODULO - karatsuba tidy up
   5196 
   5197 	pmull	v7.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   5198 
   5199 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   5200 
   5201 	eor	v10.16b, v10.16b, v7.16b                     //MODULO - fold into mid
   5202 
   5203 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   5204 
   5205 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   5206 
   5207 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   5208 
   5209 	str	w9, [x16, #12]                         //store the updated counter (32-bit word at offset 12 of the block at x16)
   5210 
   5211 	st1	{ v5.16b}, [x2]                         //store all 16B (bytes beyond the message were restored by the bif above)
   5212 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   5213 
   5214 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
   5215 	ext	v11.16b, v11.16b, v11.16b, #8   //swap the 64-bit halves of the reduced tag
   5216 	rev64	v11.16b, v11.16b                //byte-reverse each half (with the ext above: reverse all 16 bytes)
   5217 	mov	x0, x15                         //return value = x15
   5218 	st1	{ v11.16b }, [x3]               //write the updated tag back to [x3]
   5219 
   5220 	ldp	x21, x22, [sp, #16]             //restore x21/x22 from the frame
   5221 	ldp	x23, x24, [sp, #32]             //restore x23/x24
   5222 	ldp	d8, d9, [sp, #48]               //restore d8/d9
   5223 	ldp	d10, d11, [sp, #64]             //restore d10/d11
   5224 	ldp	d12, d13, [sp, #80]             //restore d12/d13
   5225 	ldp	d14, d15, [sp, #96]             //restore d14/d15
   5226 	ldp	x19, x20, [sp], #112            //restore x19/x20 and release the 112-byte frame
   5227 	ret
   5228 
   5229 .L256_enc_ret:	//return 0 without restoring a frame; NOTE(review): presumably the zero-length early exit (the branch to this label is outside this chunk) - confirm
   5230 	mov	w0, #0x0                        //return value = 0
   5231 	ret
   5232 .size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
   5233 .globl	aes_gcm_dec_256_kernel
   5234 .type	aes_gcm_dec_256_kernel,%function
   5235 .align	4
   5236 aes_gcm_dec_256_kernel:	//entry: x0 = ciphertext in, x1 = length in bits, x2 = output, x3 = tag followed by hash key powers, x4 = counter block, x5 = round keys (roles as used by the loads/stores below)
   5237 	AARCH64_VALID_CALL_TARGET
   5238 	cbz	x1, .L256_dec_ret               //bit length is zero: branch to .L256_dec_ret before any frame is set up
   5239 	stp	x19, x20, [sp, #-112]!          //allocate a 112-byte frame and save x19/x20
   5240 	mov	x16, x4                         //x16 = counter block pointer (read by the ldp/ld1 from [x16] below)
   5241 	mov	x8, x5                          //x8 = round key pointer (rk0..rk13 are loaded through it below)
   5242 	stp	x21, x22, [sp, #16]             //save x21/x22
   5243 	stp	x23, x24, [sp, #32]             //save x23/x24
   5244 	stp	d8, d9, [sp, #48]               //save d8/d9
   5245 	stp	d10, d11, [sp, #64]             //save d10/d11
   5246 	stp	d12, d13, [sp, #80]             //save d12/d13
   5247 	stp	d14, d15, [sp, #96]             //save d14/d15
   5248 
   5249 	lsr	x5, x1, #3              //byte_len
   5250 	mov	x15, x5                         //keep a copy of byte_len in x15
   5251 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   5252 #ifdef __AARCH64EB__
   5253 	rev	x10, x10
   5254 	rev	x11, x11
   5255 #endif
   5256 	ldp	x13, x14, [x8, #224]                     //load rk14
   5257 #ifdef __AARCH64EB__
   5258 	ror	x14, x14, #32
   5259 	ror	x13, x13, #32
   5260 #endif
   5261 	ld1	{v18.4s}, [x8], #16                               //load rk0
   5262 	sub	x5, x5, #1      //byte_len - 1
   5263 
   5264 	ld1	{v19.4s}, [x8], #16                               //load rk1
   5265 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   5266 
   5267 	add	x4, x0, x1, lsr #3   //end_input_ptr
   5268 	ld1	{v20.4s}, [x8], #16                               //load rk2
   5269 
   5270 	lsr	x12, x11, #32                   //x12 = upper 32 bits of x11 (byte-reversed into rev_ctr32 below)
   5271 	ld1	{v21.4s}, [x8], #16                               //load rk3
   5272 	orr	w11, w11, w11                   //32-bit register write: clears the upper 32 bits of x11
   5273 
   5274 	ld1	{v22.4s}, [x8], #16                               //load rk4
   5275 	add	x5, x5, x0                      //x5 = input address that x0 is compared against to leave the main-loop path
   5276 	rev	w12, w12                                //rev_ctr32
   5277 
   5278 	add	w12, w12, #1                            //increment rev_ctr32
   5279 	fmov	d3, x10                               //CTR block 3
   5280 
   5281 	rev	w9, w12                                 //CTR block 1
   5282 	add	w12, w12, #1                            //CTR block 1
   5283 	fmov	d1, x10                               //CTR block 1
   5284 
   5285 	orr	x9, x11, x9, lsl #32            //CTR block 1
   5286 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   5287 
   5288 	fmov	v1.d[1], x9                               //CTR block 1
   5289 	rev	w9, w12                                 //CTR block 2
   5290 	add	w12, w12, #1                            //CTR block 2
   5291 
   5292 	fmov	d2, x10                               //CTR block 2
   5293 	orr	x9, x11, x9, lsl #32            //CTR block 2
   5294 
   5295 	fmov	v2.d[1], x9                               //CTR block 2
   5296 	rev	w9, w12                                 //CTR block 3
   5297 
   5298 	orr	x9, x11, x9, lsl #32            //CTR block 3
   5299 	ld1	{v23.4s}, [x8], #16                               //load rk5
   5300 
   5301 	fmov	v3.d[1], x9                               //CTR block 3
   5302 	add	w12, w12, #1                            //CTR block 3
   5303 
   5304 	ld1	{v24.4s}, [x8], #16                               //load rk6
   5305 
   5306 	ld1	{v25.4s}, [x8], #16                               //load rk7
   5307 
   5308 	ld1	{v26.4s}, [x8], #16                               //load rk8
   5309 
   5310 	aese	v0.16b, v18.16b
   5311 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   5312 	ldr	q14, [x3, #80]                         //load h3l | h3h
   5313 #ifndef __AARCH64EB__
   5314 	ext	v14.16b, v14.16b, v14.16b, #8
   5315 #endif
   5316 
   5317 	aese	v3.16b, v18.16b
   5318 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   5319 	ldr	q15, [x3, #112]                        //load h4l | h4h
   5320 #ifndef __AARCH64EB__
   5321 	ext	v15.16b, v15.16b, v15.16b, #8
   5322 #endif
   5323 
   5324 	aese	v1.16b, v18.16b
   5325 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   5326 	ldr	q13, [x3, #64]                         //load h2l | h2h
   5327 #ifndef __AARCH64EB__
   5328 	ext	v13.16b, v13.16b, v13.16b, #8
   5329 #endif
   5330 
   5331 	aese	v2.16b, v18.16b
   5332 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   5333 	ld1	{v27.4s}, [x8], #16                                 //load rk9
   5334 
   5335 	aese	v0.16b, v19.16b
   5336 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   5337 
   5338 	aese	v1.16b, v19.16b
   5339 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   5340 	ld1	{ v11.16b}, [x3]                //load the 16 bytes at [x3] into v11 (fed into the first GHASH block by PRE 0/PRE 1 in the main loop)
   5341 	ext	v11.16b, v11.16b, v11.16b, #8   //swap the 64-bit halves of v11
   5342 	rev64	v11.16b, v11.16b                //byte-reverse each half (with the ext above: reverse all 16 bytes)
   5343 
   5344 	aese	v2.16b, v19.16b
   5345 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   5346 	ld1	{v28.4s}, [x8], #16                              //load rk10
   5347 
   5348 	aese	v3.16b, v19.16b
   5349 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   5350 	ld1	{v29.4s}, [x8], #16                              //load rk11
   5351 
   5352 	aese	v0.16b, v20.16b
   5353 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   5354 	ldr	q12, [x3, #32]                         //load h1l | h1h
   5355 #ifndef __AARCH64EB__
   5356 	ext	v12.16b, v12.16b, v12.16b, #8
   5357 #endif
   5358 	aese	v2.16b, v20.16b
   5359 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   5360 	ld1	{v30.4s}, [x8], #16                              //load rk12
   5361 
   5362 	aese	v3.16b, v20.16b
   5363 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   5364 
   5365 	aese	v0.16b, v21.16b
   5366 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   5367 
   5368 	aese	v1.16b, v20.16b
   5369 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   5370 
   5371 	aese	v3.16b, v21.16b
   5372 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   5373 
   5374 	aese	v0.16b, v22.16b
   5375 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   5376 	cmp	x0, x5                   //check if we have <= 4 blocks (flags are consumed by the b.ge to .L256_dec_tail further down)
   5377 
   5378 	aese	v2.16b, v21.16b
   5379 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   5380 
   5381 	aese	v1.16b, v21.16b
   5382 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   5383 
   5384 	aese	v3.16b, v22.16b
   5385 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   5386 
   5387 	aese	v2.16b, v22.16b
   5388 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   5389 
   5390 	aese	v1.16b, v22.16b
   5391 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   5392 
   5393 	aese	v3.16b, v23.16b
   5394 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   5395 
   5396 	aese	v0.16b, v23.16b
   5397 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   5398 
   5399 	aese	v1.16b, v23.16b
   5400 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   5401 
   5402 	aese	v2.16b, v23.16b
   5403 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   5404 
   5405 	aese	v0.16b, v24.16b
   5406 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   5407 
   5408 	aese	v3.16b, v24.16b
   5409 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   5410 
   5411 	aese	v1.16b, v24.16b
   5412 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   5413 
   5414 	aese	v2.16b, v24.16b
   5415 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   5416 
   5417 	aese	v0.16b, v25.16b
   5418 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   5419 
   5420 	aese	v1.16b, v25.16b
   5421 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   5422 
   5423 	aese	v3.16b, v25.16b
   5424 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   5425 
   5426 	aese	v0.16b, v26.16b
   5427 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   5428 
   5429 	aese	v2.16b, v25.16b
   5430 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   5431 
   5432 	aese	v3.16b, v26.16b
   5433 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   5434 
   5435 	aese	v1.16b, v26.16b
   5436 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   5437 
   5438 	aese	v0.16b, v27.16b
   5439 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   5440 
   5441 	aese	v2.16b, v26.16b
   5442 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   5443 	ld1	{v31.4s}, [x8], #16                             //load rk13
   5444 
   5445 	aese	v1.16b, v27.16b
   5446 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   5447 
   5448 	aese	v0.16b, v28.16b
   5449 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   5450 
   5451 	aese	v3.16b, v27.16b
   5452 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   5453 
   5454 	aese	v1.16b, v28.16b
   5455 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   5456 
   5457 	aese	v2.16b, v27.16b
   5458 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   5459 
   5460 	aese	v3.16b, v28.16b
   5461 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   5462 
   5463 	aese	v0.16b, v29.16b
   5464 	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
   5465 
   5466 	aese	v2.16b, v28.16b
   5467 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   5468 
   5469 	aese	v3.16b, v29.16b
   5470 	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
   5471 
   5472 	aese	v1.16b, v29.16b
   5473 	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
   5474 
   5475 	aese	v2.16b, v29.16b
   5476 	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
   5477 
   5478 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   5479 
   5480 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   5481 
   5482 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   5483 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   5484 
   5485 	aese	v1.16b, v30.16b
   5486 	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
   5487 
   5488 	aese	v0.16b, v30.16b
   5489 	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
   5490 
   5491 	aese	v2.16b, v30.16b
   5492 	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
   5493 
   5494 	aese	v3.16b, v30.16b
   5495 	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
   5496 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   5497 
   5498 	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
   5499 
   5500 	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
   5501 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   5502 
   5503 	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
   5504 
   5505 	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
   5506 	b.ge	.L256_dec_tail                                    //handle tail (uses the flags of the earlier cmp x0, x5; nothing in between writes the flags)
   5507 
   5508 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
   5509 
   5510 	rev	w9, w12                                 //CTR block 4
   5511 
   5512 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   5513 
   5514 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   5515 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   5516 	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
   5517 
   5518 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   5519 
   5520 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   5521 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   5522 	add	w12, w12, #1                            //CTR block 4
   5523 
   5524 	fmov	d0, x10                               //CTR block 4
   5525 	orr	x9, x11, x9, lsl #32            //CTR block 4
   5526 
   5527 	fmov	v0.d[1], x9                               //CTR block 4
   5528 	rev	w9, w12                                 //CTR block 5
   5529 	add	w12, w12, #1                            //CTR block 5
   5530 
   5531 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   5532 
   5533 	orr	x9, x11, x9, lsl #32            //CTR block 5
   5534 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   5535 	eor	x7, x7, x14                   //AES block 0 - round 14 high
   5536 #ifdef __AARCH64EB__
   5537 	rev	x7, x7
   5538 #endif
   5539 	eor	x6, x6, x13                   //AES block 0 - round 14 low
   5540 #ifdef __AARCH64EB__
   5541 	rev	x6, x6
   5542 #endif
   5543 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   5544 	fmov	d1, x10                               //CTR block 5
   5545 
   5546 	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
   5547 
   5548 	fmov	v1.d[1], x9                               //CTR block 5
   5549 	rev	w9, w12                                 //CTR block 6
   5550 	add	w12, w12, #1                            //CTR block 6
   5551 
   5552 	eor	x19, x19, x13                   //AES block 1 - round 14 low
   5553 #ifdef __AARCH64EB__
   5554 	rev	x19, x19
   5555 #endif
   5556 	orr	x9, x11, x9, lsl #32            //CTR block 6
   5557 
   5558 	eor	x20, x20, x14                   //AES block 1 - round 14 high
   5559 #ifdef __AARCH64EB__
   5560 	rev	x20, x20
   5561 #endif
   5562 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   5563 
   5564 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   5565 	cmp	x0, x5                   //check if we have <= 8 blocks
   5566 	b.ge	.L256_dec_prepretail                              //do prepretail
   5567 
   5568 .L256_dec_main_loop:	//main	loop start
   5569 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   5570 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   5571 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   5572 
   5573 	aese	v0.16b, v18.16b
   5574 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   5575 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   5576 
   5577 	aese	v1.16b, v18.16b
   5578 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   5579 	fmov	d2, x10                               //CTR block 4k+6
   5580 
   5581 	fmov	v2.d[1], x9                               //CTR block 4k+6
   5582 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   5583 	rev	w9, w12                                 //CTR block 4k+7
   5584 
   5585 	aese	v0.16b, v19.16b
   5586 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   5587 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   5588 
   5589 	aese	v1.16b, v19.16b
   5590 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   5591 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   5592 
   5593 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   5594 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   5595 	fmov	d3, x10                               //CTR block 4k+7
   5596 
   5597 	aese	v0.16b, v20.16b
   5598 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   5599 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   5600 
   5601 	aese	v2.16b, v18.16b
   5602 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   5603 	fmov	v3.d[1], x9                               //CTR block 4k+7
   5604 
   5605 	aese	v1.16b, v20.16b
   5606 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   5607 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   5608 
   5609 	aese	v0.16b, v21.16b
   5610 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   5611 	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
   5612 #ifdef __AARCH64EB__
   5613 	rev	x22, x22
   5614 #endif
   5615 	aese	v2.16b, v19.16b
   5616 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   5617 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   5618 
   5619 	aese	v1.16b, v21.16b
   5620 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   5621 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   5622 
   5623 	aese	v3.16b, v18.16b
   5624 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   5625 	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
   5626 #ifdef __AARCH64EB__
   5627 	rev	x21, x21
   5628 #endif
   5629 	aese	v2.16b, v20.16b
   5630 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   5631 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   5632 
   5633 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   5634 
   5635 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   5636 
   5637 	aese	v2.16b, v21.16b
   5638 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   5639 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   5640 
   5641 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   5642 	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
   5643 #ifdef __AARCH64EB__
   5644 	rev	x23, x23
   5645 #endif
   5646 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   5647 	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
   5648 #ifdef __AARCH64EB__
   5649 	rev	x24, x24
   5650 #endif
   5651 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   5652 
   5653 	aese	v2.16b, v22.16b
   5654 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   5655 
   5656 	aese	v3.16b, v19.16b
   5657 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   5658 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   5659 
   5660 	aese	v0.16b, v22.16b
   5661 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   5662 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   5663 
   5664 	aese	v2.16b, v23.16b
   5665 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   5666 	add	w12, w12, #1                            //CTR block 4k+7
   5667 
   5668 	aese	v3.16b, v20.16b
   5669 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   5670 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   5671 
   5672 	aese	v1.16b, v22.16b
   5673 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   5674 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   5675 
   5676 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   5677 
   5678 	aese	v3.16b, v21.16b
   5679 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   5680 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   5681 
   5682 	aese	v1.16b, v23.16b
   5683 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   5684 
   5685 	aese	v0.16b, v23.16b
   5686 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   5687 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   5688 
   5689 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   5690 	rev	w9, w12                                 //CTR block 4k+8
   5691 
   5692 	aese	v1.16b, v24.16b
   5693 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   5694 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   5695 
   5696 	aese	v0.16b, v24.16b
   5697 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   5698 	add	w12, w12, #1                            //CTR block 4k+8
   5699 
   5700 	aese	v3.16b, v22.16b
   5701 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   5702 
   5703 	aese	v1.16b, v25.16b
   5704 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   5705 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   5706 
   5707 	aese	v0.16b, v25.16b
   5708 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   5709 
   5710 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   5711 	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
   5712 
   5713 	aese	v3.16b, v23.16b
   5714 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   5715 
   5716 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   5717 
   5718 	aese	v0.16b, v26.16b
   5719 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   5720 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   5721 
   5722 	aese	v3.16b, v24.16b
   5723 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   5724 
   5725 	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   5726 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   5727 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   5728 
   5729 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   5730 
   5731 	aese	v0.16b, v27.16b
   5732 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   5733 	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
   5734 
   5735 	aese	v1.16b, v26.16b
   5736 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   5737 
   5738 	aese	v2.16b, v24.16b
   5739 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   5740 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   5741 
   5742 	aese	v0.16b, v28.16b
   5743 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   5744 
   5745 	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
   5746 	movi	v8.8b, #0xc2
   5747 
   5748 	aese	v2.16b, v25.16b
   5749 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   5750 	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
   5751 
   5752 	aese	v0.16b, v29.16b
   5753 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   5754 
   5755 	aese	v3.16b, v25.16b
   5756 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   5757 	shl	d8, d8, #56               //mod_constant
   5758 
   5759 	aese	v2.16b, v26.16b
   5760 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   5761 	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
   5762 
   5763 	aese	v0.16b, v30.16b
   5764 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   5765 
   5766 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   5767 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   5768 
   5769 	aese	v1.16b, v27.16b
   5770 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   5771 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
   5772 
   5773 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   5774 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   5775 
   5776 	aese	v1.16b, v28.16b
   5777 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   5778 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   5779 
   5780 	aese	v2.16b, v27.16b
   5781 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   5782 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
   5783 
   5784 	aese	v3.16b, v26.16b
   5785 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   5786 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   5787 
   5788 	aese	v1.16b, v29.16b
   5789 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   5790 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   5791 
   5792 	aese	v2.16b, v28.16b
   5793 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   5794 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   5795 
   5796 	aese	v3.16b, v27.16b
   5797 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   5798 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
   5799 
   5800 	aese	v1.16b, v30.16b
   5801 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   5802 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
   5803 
   5804 	aese	v2.16b, v29.16b
   5805 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   5806 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   5807 
   5808 	aese	v3.16b, v28.16b
   5809 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   5810 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   5811 
   5812 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   5813 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   5814 
   5815 	aese	v2.16b, v30.16b
   5816 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   5817 	fmov	d0, x10                               //CTR block 4k+8
   5818 
   5819 	aese	v3.16b, v29.16b
   5820 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   5821 	fmov	v0.d[1], x9                               //CTR block 4k+8
   5822 
   5823 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   5824 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   5825 	rev	w9, w12                                 //CTR block 4k+9
   5826 
   5827 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   5828 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   5829 	cmp	x0, x5                   //.LOOP CONTROL
   5830 
   5831 	add	w12, w12, #1                            //CTR block 4k+9
   5832 
   5833 	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
   5834 #ifdef __AARCH64EB__
   5835 	rev	x6, x6
   5836 #endif
   5837 	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
   5838 #ifdef __AARCH64EB__
   5839 	rev	x7, x7
   5840 #endif
   5841 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   5842 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   5843 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   5844 
   5845 	aese	v3.16b, v30.16b
   5846 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   5847 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   5848 
   5849 	fmov	d1, x10                               //CTR block 4k+9
   5850 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   5851 
   5852 	fmov	v1.d[1], x9                               //CTR block 4k+9
   5853 	rev	w9, w12                                 //CTR block 4k+10
   5854 	add	w12, w12, #1                            //CTR block 4k+10
   5855 
   5856 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   5857 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   5858 
   5859 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   5860 	eor	x20, x20, x14                   //AES block 4k+5 - round 14 high
   5861 #ifdef __AARCH64EB__
   5862 	rev	x20, x20
   5863 #endif
   5864 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   5865 
   5866 	eor	x19, x19, x13                   //AES block 4k+5 - round 14 low
   5867 #ifdef __AARCH64EB__
   5868 	rev	x19, x19
   5869 #endif
   5870 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   5871 
   5872 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   5873 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   5874 	b.lt	.L256_dec_main_loop
   5875 
   5876 
   5877 .L256_dec_prepretail:	//PREPRETAIL - AES rounds 0-13 on v0-v3, GHASH of v4-v7 plus reduction, store of output blocks 4k+2 and 4k+3
   5878 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   5879 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   5880 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   5881 
   5882 	aese	v0.16b, v18.16b
   5883 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   5884 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   5885 
   5886 	aese	v1.16b, v18.16b
   5887 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   5888 	fmov	d2, x10                               //CTR block 4k+6
   5889 
   5890 	fmov	v2.d[1], x9                               //CTR block 4k+6
   5891 	rev	w9, w12                                 //CTR block 4k+7
   5892 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   5893 
   5894 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   5895 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   5896 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   5897 
   5898 	aese	v1.16b, v19.16b
   5899 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   5900 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   5901 
   5902 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   5903 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   5904 	fmov	d3, x10                               //CTR block 4k+7
   5905 
   5906 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   5907 	fmov	v3.d[1], x9                               //CTR block 4k+7
   5908 
   5909 	aese	v2.16b, v18.16b
   5910 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   5911 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   5912 
   5913 	aese	v0.16b, v19.16b
   5914 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   5915 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   5916 
   5917 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   5918 
   5919 	aese	v2.16b, v19.16b
   5920 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   5921 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   5922 
   5923 	aese	v3.16b, v18.16b
   5924 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   5925 
   5926 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   5927 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   5928 
   5929 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   5930 
   5931 	aese	v3.16b, v19.16b
   5932 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   5933 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   5934 
   5935 	aese	v0.16b, v20.16b
   5936 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   5937 
   5938 	aese	v1.16b, v20.16b
   5939 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   5940 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   5941 
   5942 	aese	v2.16b, v20.16b
   5943 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   5944 
   5945 	aese	v0.16b, v21.16b
   5946 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   5947 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   5948 
   5949 	aese	v3.16b, v20.16b
   5950 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   5951 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   5952 
   5953 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   5954 
   5955 	aese	v0.16b, v22.16b
   5956 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   5957 
   5958 	aese	v3.16b, v21.16b
   5959 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   5960 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   5961 
   5962 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   5963 
   5964 	aese	v0.16b, v23.16b
   5965 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   5966 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   5967 
   5968 	aese	v3.16b, v22.16b
   5969 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   5970 
   5971 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   5972 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   5973 
   5974 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   5975 
   5976 	aese	v3.16b, v23.16b
   5977 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   5978 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   5979 
   5980 	aese	v2.16b, v21.16b
   5981 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   5982 
   5983 	aese	v1.16b, v21.16b
   5984 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   5985 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   5986 
   5987 	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   5988 
   5989 	aese	v2.16b, v22.16b
   5990 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   5991 	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
   5992 
   5993 	aese	v1.16b, v22.16b
   5994 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   5995 
   5996 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   5997 
   5998 	aese	v2.16b, v23.16b
   5999 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   6000 	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
   6001 
   6002 	aese	v1.16b, v23.16b
   6003 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   6004 
   6005 	aese	v3.16b, v24.16b
   6006 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   6007 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   6008 
   6009 	aese	v2.16b, v24.16b
   6010 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   6011 
   6012 	aese	v0.16b, v24.16b
   6013 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   6014 	movi	v8.8b, #0xc2              //0xc2 in each of the low 8 bytes, upper half zero; the shl below turns d8 into 0xc2 shifted left by 56
   6015 
   6016 	aese	v1.16b, v24.16b
   6017 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   6018 	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
   6019 
   6020 	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
   6021 
   6022 	aese	v3.16b, v25.16b
   6023 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   6024 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   6025 
   6026 	aese	v1.16b, v25.16b
   6027 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   6028 
   6029 	aese	v0.16b, v25.16b
   6030 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   6031 	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
   6032 
   6033 	aese	v3.16b, v26.16b
   6034 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   6035 
   6036 	aese	v2.16b, v25.16b
   6037 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   6038 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   6039 
   6040 	aese	v1.16b, v26.16b
   6041 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   6042 
   6043 	aese	v0.16b, v26.16b
   6044 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   6045 	shl	d8, d8, #56               //mod_constant
   6046 
   6047 	aese	v2.16b, v26.16b
   6048 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   6049 
   6050 	aese	v1.16b, v27.16b
   6051 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   6052 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   6053 
   6054 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   6055 
   6056 	aese	v2.16b, v27.16b
   6057 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   6058 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   6059 
   6060 	aese	v3.16b, v27.16b
   6061 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   6062 
   6063 	aese	v0.16b, v27.16b
   6064 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   6065 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   6066 
   6067 	aese	v2.16b, v28.16b
   6068 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   6069 
   6070 	aese	v3.16b, v28.16b
   6071 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   6072 
   6073 	aese	v0.16b, v28.16b
   6074 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   6075 	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
   6076 #ifdef __AARCH64EB__
   6077 	rev	x22, x22
   6078 #endif
   6079 	aese	v1.16b, v28.16b
   6080 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   6081 	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
   6082 #ifdef __AARCH64EB__
   6083 	rev	x23, x23
   6084 #endif
   6085 	aese	v2.16b, v29.16b
   6086 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   6087 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   6088 
   6089 	aese	v0.16b, v29.16b
   6090 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   6091 	add	w12, w12, #1                            //CTR block 4k+7
   6092 
   6093 	aese	v1.16b, v29.16b
   6094 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   6095 	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
   6096 #ifdef __AARCH64EB__
   6097 	rev	x21, x21
   6098 #endif
   6099 
   6100 	aese	v2.16b, v30.16b
   6101 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   6102 
   6103 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   6104 	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
   6105 #ifdef __AARCH64EB__
   6106 	rev	x24, x24
   6107 #endif
   6108 
   6109 	aese	v3.16b, v29.16b
   6110 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   6111 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   6112 
   6113 	aese	v1.16b, v30.16b
   6114 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   6115 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   6116 
   6117 	aese	v0.16b, v30.16b
   6118 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   6119 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   6120 
   6121 	aese	v3.16b, v30.16b
   6122 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   6123 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   6124 
   6125 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   6126 
   6127 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   6128 
   6129 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   6130 
   6131 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13 (no aesmc; the round 14 key is XORed in later from x13/x14)
   6132 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   6133 .L256_dec_tail:	//TAIL - decrypt the first remaining block with v0, then dispatch on bytes left: up to three more full blocks via the more_than_N labels, then the final, possibly partial, block
   6134 
   6135 	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
   6136 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
   6137 
   6138 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
   6139 
   6140 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   6141 
   6142 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   6143 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   6144 
   6145 	cmp	x5, #48                         //more than 48 bytes (three blocks) left? flags are consumed by the b.gt below
   6146 
   6147 	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
   6148 #ifdef __AARCH64EB__
   6149 	rev	x6, x6
   6150 #endif
   6151 
   6152 	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
   6153 #ifdef __AARCH64EB__
   6154 	rev	x7, x7
   6155 #endif
   6156 	b.gt	.L256_dec_blocks_more_than_3
   6157 
   6158 	sub	w12, w12, #1                    //at most 3 blocks left: one prepared counter block goes unused, so step rev_ctr32 back
   6159 	mov	v3.16b, v2.16b                  //shift keystream down: the final block is XORed with v3
   6160 	movi	v10.8b, #0                      //clear GHASH mid accumulator (the more_than_3 path overwrites it with pmull instead)
   6161 
   6162 	movi	v11.8b, #0                      //clear GHASH low accumulator (running tag was copied to v8 above)
   6163 	cmp	x5, #32                         //more than 32 bytes (two blocks) left?
   6164 
   6165 	movi	v9.8b, #0                       //clear GHASH high accumulator
   6166 	mov	v2.16b, v1.16b                  //shift keystream down: the final-1 block is XORed with v2
   6167 	b.gt	.L256_dec_blocks_more_than_2
   6168 
   6169 	sub	w12, w12, #1                    //at most 2 blocks left: a second counter block goes unused
   6170 
   6171 	mov	v3.16b, v1.16b                  //final block now uses the keystream that was in v1
   6172 	cmp	x5, #16                         //more than 16 bytes (one block) left?
   6173 	b.gt	.L256_dec_blocks_more_than_1
   6174 
   6175 	sub	w12, w12, #1                    //at most 1 block left: a third counter block goes unused
   6176 	b	.L256_dec_blocks_less_than_1    //x6/x7 already hold the only block, skip straight to the final-block handling
   6177 .L256_dec_blocks_more_than_3:	//blocks	left >  3 - store the block held in x6/x7, hash its ciphertext with v15 (v9/v10/v11 are written fresh here, not accumulated), decrypt the next block with v1
   6178 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   6179 	ld1	{ v5.16b}, [x0], #16                     //AES final-2 block - load ciphertext
   6180 
   6181 	stp	x6, x7, [x2], #16       //AES final-3 block  - store result
   6182 
   6183 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
   6184 
   6185 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6186 
   6187 	eor	v0.16b, v5.16b, v1.16b                           //AES final-2 block - result
   6188 
   6189 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
   6190 
   6191 	mov	x6, v0.d[0]                           //AES final-2 block - mov low
   6192 
   6193 	mov	x7, v0.d[1]                           //AES final-2 block - mov high
   6194 
   6195 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
   6196 
   6197 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6198 
   6199 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
   6200 
   6201 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
   6202 	eor	x6, x6, x13                  //AES final-2 block - round 14 low
   6203 #ifdef __AARCH64EB__
   6204 	rev	x6, x6
   6205 #endif
   6206 
   6207 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
   6208 	eor	x7, x7, x14                  //AES final-2 block - round 14 high
   6209 #ifdef __AARCH64EB__
   6210 	rev	x7, x7
   6211 #endif
   6212 .L256_dec_blocks_more_than_2:	//blocks	left >  2 - store the block held in x6/x7, XOR-accumulate its GHASH (key v14) into v9/v10/v11, decrypt the next block with v2
   6213 
   6214 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
   6215 	ld1	{ v5.16b}, [x0], #16                     //AES final-1 block - load ciphertext
   6216 
   6217 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6218 	stp	x6, x7, [x2], #16       //AES final-2 block  - store result
   6219 
   6220 	eor	v0.16b, v5.16b, v2.16b                           //AES final-1 block - result
   6221 
   6222 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
   6223 
   6224 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
   6225 
   6226 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
   6227 
   6228 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
   6229 	mov	x6, v0.d[0]                           //AES final-1 block - mov low
   6230 
   6231 	mov	x7, v0.d[1]                           //AES final-1 block - mov high
   6232 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
   6233 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6234 
   6235 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
   6236 
   6237 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
   6238 	eor	x6, x6, x13                  //AES final-1 block - round 14 low
   6239 #ifdef __AARCH64EB__
   6240 	rev	x6, x6
   6241 #endif
   6242 
   6243 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
   6244 	eor	x7, x7, x14                  //AES final-1 block - round 14 high
   6245 #ifdef __AARCH64EB__
   6246 	rev	x7, x7
   6247 #endif
   6248 .L256_dec_blocks_more_than_1:	//blocks	left >  1 - store the block held in x6/x7, XOR-accumulate its GHASH (key v13) into v9/v10/v11, decrypt the final block with v3 (stored later, after masking)
   6249 
   6250 	stp	x6, x7, [x2], #16       //AES final-1 block  - store result
   6251 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
   6252 
   6253 	ld1	{ v5.16b}, [x0], #16                     //AES final block - load ciphertext
   6254 
   6255 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6256 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6257 
   6258 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
   6259 
   6260 	eor	v0.16b, v5.16b, v3.16b                           //AES final block - result
   6261 
   6262 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
   6263 
   6264 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
   6265 
   6266 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
   6267 	mov	x6, v0.d[0]                           //AES final block - mov low
   6268 
   6269 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
   6270 
   6271 	mov	x7, v0.d[1]                           //AES final block - mov high
   6272 
   6273 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
   6274 	eor	x6, x6, x13                  //AES final block - round 14 low
   6275 #ifdef __AARCH64EB__
   6276 	rev	x6, x6
   6277 #endif
   6278 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
   6279 
   6280 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
   6281 
   6282 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
   6283 	eor	x7, x7, x14                  //AES final block - round 14 high
   6284 #ifdef __AARCH64EB__
   6285 	rev	x7, x7
   6286 #endif
   6287 .L256_dec_blocks_less_than_1:	//blocks	left <= 1 - final, possibly partial, block: build a bit mask, merge the result with existing output bytes, finish GHASH and reduction, store counter and tag, restore registers and return
   6288 
   6289 	and	x1, x1, #127                   //bit_length %= 128
   6290 	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
   6291 
   6292 	sub	x1, x1, #128                   //bit_length -= 128
   6293 	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
   6294 
   6295 	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
   6296 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   6297 
   6298 	and	x1, x1, #127                   //bit_length %= 128
   6299 
   6300 	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
   6301 	cmp	x1, #64
   6302 
   6303 	csel	x9, x13, x14, lt                //low-64 mask: all ones when fewer than 64 bits are dropped, else the shifted mask
   6304 	csel	x10, x14, xzr, lt               //high-64 mask: the shifted mask when fewer than 64 bits are dropped, else zero
   6305 
   6306 	fmov	d0, x9                                  //ctr0b is mask for last block
   6307 	and	x6, x6, x9                      //keep only the valid bits of the low result half
   6308 
   6309 	mov	v0.d[1], x10                    //v0 now holds the full 128-bit mask (x9 low, x10 high)
   6310 	bic	x4, x4, x9          //mask out low existing bytes
   6311 
   6312 #ifndef __AARCH64EB__
   6313 	rev	w9, w12                         //byte-reverse rev_ctr32 into w9 for the counter store below
   6314 #else
   6315 	mov	w9, w12                         //big-endian build: copy without byte reversal
   6316 #endif
   6317 
   6318 	bic	x5, x5, x10      //mask out high existing bytes
   6319 
   6320 	orr	x6, x6, x4                      //merge masked result with the preserved existing bytes (low half)
   6321 
   6322 	and	x7, x7, x10                     //keep only the valid bits of the high result half
   6323 
   6324 	orr	x7, x7, x5                      //merge masked result with the preserved existing bytes (high half)
   6325 
   6326 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   6327 
   6328 	rev64	v4.16b, v5.16b                                    //GHASH final block
   6329 
   6330 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   6331 
   6332 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   6333 
   6334 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   6335 
   6336 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   6337 
   6338 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   6339 
   6340 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   6341 
   6342 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   6343 
   6344 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   6345 
   6346 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   6347 	movi	v8.8b, #0xc2                    //0xc2 in each of the low 8 bytes; the shl below turns d8 into 0xc2 shifted left by 56
   6348 
   6349 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   6350 
   6351 	shl	d8, d8, #56               //mod_constant
   6352 
   6353 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   6354 
   6355 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   6356 
   6357 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   6358 
   6359 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   6360 
   6361 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   6362 
   6363 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   6364 
   6365 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   6366 
   6367 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   6368 
   6369 	stp	x6, x7, [x2]                    //store the merged final block (no pointer post-increment)
   6370 
   6371 	str	w9, [x16, #12]                          //store the updated counter
   6372 
   6373 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   6374 	ext	v11.16b, v11.16b, v11.16b, #8   //swap the two 64-bit halves of the accumulator
   6375 	rev64	v11.16b, v11.16b                //reverse the bytes within each 64-bit half
   6376 	mov	x0, x15                         //return value comes from x15, which is set earlier in the function
   6377 	st1	{ v11.16b }, [x3]               //write the GHASH accumulator back through x3
   6378 
   6379 	ldp	x21, x22, [sp, #16]             //restore callee-saved general registers
   6380 	ldp	x23, x24, [sp, #32]
   6381 	ldp	d8, d9, [sp, #48]               //restore callee-saved low halves of v8-v15
   6382 	ldp	d10, d11, [sp, #64]
   6383 	ldp	d12, d13, [sp, #80]
   6384 	ldp	d14, d15, [sp, #96]
   6385 	ldp	x19, x20, [sp], #112            //restore x19/x20 and release the 112-byte frame
   6386 	ret
   6387 
   6388 .L256_dec_ret:	//exit path that returns 0; nothing is restored from the stack on this path
   6389 	mov	w0, #0x0                        //return value 0
   6390 	ret
   6391 .size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
   6392 .section	.rodata
   6393 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	//NUL-terminated ASCII: GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>
   6394 .align	2
   6395 .align	2	//repeated directive; already aligned by the line above
   6396 #endif
   6397