Home | History | Annotate | Line # | Download | only in aarch64
      1 #include "arm_arch.h"
      2 
      3 #if __ARM_MAX_ARCH__>=8
      4 .arch	armv8-a+crypto
      5 .text
      6 .globl	unroll8_eor3_aes_gcm_enc_128_kernel
      7 .type	unroll8_eor3_aes_gcm_enc_128_kernel,%function
      8 .align	4
      9 unroll8_eor3_aes_gcm_enc_128_kernel:
     10 	AARCH64_VALID_CALL_TARGET
     11 	cbz	x1, .L128_enc_ret
     12 	stp	d8, d9, [sp, #-80]!
     13 	lsr	x9, x1, #3
     14 	mov	x16, x4
     15 	mov	x8, x5
     16 	stp	d10, d11, [sp, #16]
     17 	stp	d12, d13, [sp, #32]
     18 	stp	d14, d15, [sp, #48]
     19 	mov	x5, #0xc200000000000000
     20 	stp	x5, xzr, [sp, #64]
     21 	add	x10, sp, #64
     22 
     23 	mov	x15, #0x100000000				//set up counter increment
     24 	movi	v31.16b, #0x0
     25 	mov	v31.d[1], x15
     26 	mov	x5, x9
     27 	ld1	{ v0.16b}, [x16]					//CTR block 0
     28 
     29 	sub	x5, x5, #1	 	//byte_len - 1
     30 
     31 	and	x5, x5, #0xffffffffffffff80		//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
     32 
     33 	rev32	v30.16b, v0.16b				//set up reversed counter
     34 
     35 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
     36 
     37 	rev32	v1.16b, v30.16b				//CTR block 1
     38 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
     39 
     40 	rev32	v2.16b, v30.16b				//CTR block 2
     41 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
     42 
     43 	rev32	v3.16b, v30.16b				//CTR block 3
     44 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
     45 
     46 	rev32	v4.16b, v30.16b				//CTR block 4
     47 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
     48 
     49 	rev32	v5.16b, v30.16b				//CTR block 5
     50 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
     51 	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1
     52 
     53 	rev32	v6.16b, v30.16b				//CTR block 6
     54 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
     55 
     56 	rev32	v7.16b, v30.16b				//CTR block 7
     57 	add	v30.4s, v30.4s, v31.4s		//CTR block 7
     58 
     59 	aese	v4.16b, v26.16b
     60 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
     61 	aese	v6.16b, v26.16b
     62 	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
     63 	aese	v3.16b, v26.16b
     64 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
     65 
     66 	aese	v0.16b, v26.16b
     67 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
     68 	aese	v1.16b, v26.16b
     69 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
     70 	aese	v2.16b, v26.16b
     71 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
     72 
     73 	aese	v7.16b, v26.16b
     74 	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
     75 	aese	v5.16b, v26.16b
     76 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
     77 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
     78 
     79 	aese	v3.16b, v27.16b
     80 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
     81 
     82 	aese	v7.16b, v27.16b
     83 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
     84 	aese	v5.16b, v27.16b
     85 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
     86 	aese	v4.16b, v27.16b
     87 	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
     88 
     89 	aese	v2.16b, v27.16b
     90 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
     91 	aese	v6.16b, v27.16b
     92 	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
     93 	aese	v0.16b, v27.16b
     94 	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
     95 
     96 	aese	v5.16b, v28.16b
     97 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
     98 	aese	v1.16b, v27.16b
     99 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
    100 	aese	v0.16b, v28.16b
    101 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
    102 
    103 	aese	v2.16b, v28.16b
    104 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
    105 	aese	v3.16b, v28.16b
    106 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
    107 	aese	v7.16b, v28.16b
    108 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
    109 
    110 	aese	v1.16b, v28.16b
    111 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
    112 	aese	v6.16b, v28.16b
    113 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
    114 	aese	v4.16b, v28.16b
    115 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
    116 
    117 	aese	v2.16b, v26.16b
    118 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
    119 
    120 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
    121 	aese	v5.16b, v26.16b
    122 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
    123 	aese	v0.16b, v26.16b
    124 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
    125 
    126 	aese	v4.16b, v26.16b
    127 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
    128 	aese	v3.16b, v26.16b
    129 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
    130 	aese	v6.16b, v26.16b
    131 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
    132 
    133 	aese	v7.16b, v26.16b
    134 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
    135 
    136 	aese	v6.16b, v27.16b
    137 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
    138 	aese	v1.16b, v26.16b
    139 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
    140 	aese	v5.16b, v27.16b
    141 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
    142 
    143 	aese	v7.16b, v27.16b
    144 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
    145 	aese	v4.16b, v27.16b
    146 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
    147 	aese	v0.16b, v27.16b
    148 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
    149 
    150 	aese	v1.16b, v27.16b
    151 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
    152 	aese	v2.16b, v27.16b
    153 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
    154 	aese	v3.16b, v27.16b
    155 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
    156 
    157 	aese	v7.16b, v28.16b
    158 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
    159 	aese	v0.16b, v28.16b
    160 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
    161 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
    162 
    163 	aese	v1.16b, v28.16b
    164 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
    165 	aese	v3.16b, v28.16b
    166 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
    167 	aese	v2.16b, v28.16b
    168 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
    169 
    170 	aese	v4.16b, v28.16b
    171 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
    172 	aese	v5.16b, v28.16b
    173 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
    174 	aese	v6.16b, v28.16b
    175 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
    176 
    177 	aese	v4.16b, v26.16b
    178 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
    179 	aese	v3.16b, v26.16b
    180 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
    181 	aese	v2.16b, v26.16b
    182 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
    183 
    184 	aese	v7.16b, v26.16b
    185 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
    186 	aese	v6.16b, v26.16b
    187 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
    188 	aese	v5.16b, v26.16b
    189 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
    190 
    191 	aese	v0.16b, v26.16b
    192 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
    193 	aese	v1.16b, v26.16b
    194 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
    195 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
    196 
    197 	aese	v5.16b, v27.16b
    198 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
    199 
    200 	ld1	{ v19.16b}, [x3]
    201 	ext	v19.16b, v19.16b, v19.16b, #8
    202 	rev64	v19.16b, v19.16b
    203 
    204 	aese	v7.16b, v27.16b
    205 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
    206 
    207 	aese	v4.16b, v27.16b
    208 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
    209 	aese	v3.16b, v27.16b
    210 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
    211 	aese	v6.16b, v27.16b
    212 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
    213 
    214 	aese	v1.16b, v27.16b
    215 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
    216 	aese	v2.16b, v27.16b
    217 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
    218 	aese	v0.16b, v27.16b
    219 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
    220 
    221 	aese	v3.16b, v28.16b
    222 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
    223 	aese	v6.16b, v28.16b
    224 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
    225 	aese	v2.16b, v28.16b
    226 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
    227 
    228 	aese	v7.16b, v28.16b
    229 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
    230 	aese	v0.16b, v28.16b
    231 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
    232 	ldr	q27, [x8, #160]					//load rk10
    233 
    234 	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
    235 	aese	v4.16b, v28.16b
    236 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
    237 	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
    238 
    239 	aese	v5.16b, v28.16b
    240 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
    241 	aese	v1.16b, v28.16b
    242 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
    243 	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
    244 
    245 	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
    246 	add	x5, x5, x0
    247 	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
    248 
    249 	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
    250 	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
    251 	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
    252 
    253 	add	x4, x0, x1, lsr #3		//end_input_ptr
    254 	cmp	x0, x5				//check if we have <= 8 blocks
    255 	b.ge	.L128_enc_tail						//handle tail
    256 
    257 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
    258 
    259 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
    260 
    261 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
    262 
    263 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
    264 	cmp	x0, x5				//check if we have <= 8 blocks
    265 
    266 .inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
    267 	rev32	v0.16b, v30.16b				//CTR block 8
    268 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
    269 
    270 .inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
    271 	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
    272 
    273 	rev32	v1.16b, v30.16b				//CTR block 9
    274 .inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
    275 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
    276 
    277 .inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
    278 .inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
    279 .inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
    280 
    281 	rev32	v2.16b, v30.16b				//CTR block 10
    282 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
    283 
    284 .inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
    285 .inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b,v27.16b				//AES block 7 - result
    286 	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
    287 
    288 	rev32	v3.16b, v30.16b				//CTR block 11
    289 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
    290 	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
    291 
    292 	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
    293 
    294 	rev32	v4.16b, v30.16b				//CTR block 12
    295 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
    296 	b.ge	.L128_enc_prepretail					//do prepretail
    297 
    298 .L128_enc_main_loop:	//main	loop start
    299 	rev32	v5.16b, v30.16b				//CTR block 8k+13
    300 	ldr	q20, [x3, #128]				//load h5l | h5h
    301 	ext	v20.16b, v20.16b, v20.16b, #8
    302 	ldr	q22, [x3, #160]				//load h6l | h6h
    303 	ext	v22.16b, v22.16b, v22.16b, #8
    304 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
    305 
    306 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
    307 	rev64	v8.16b, v8.16b						//GHASH block 8k
    308 	ldr	q23, [x3, #176]				//load h7l | h7h
    309 	ext	v23.16b, v23.16b, v23.16b, #8
    310 	ldr	q25, [x3, #208]				//load h8l | h8h
    311 	ext	v25.16b, v25.16b, v25.16b, #8
    312 
    313 	rev32	v6.16b, v30.16b				//CTR block 8k+14
    314 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
    315 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
    316 
    317 	ldr	q21, [x3, #144]				//load h6k | h5k
    318 	ldr	q24, [x3, #192]				//load h8k | h7k
    319 	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
    320 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
    321 
    322 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
    323 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
    324 	rev32	v7.16b, v30.16b				//CTR block 8k+15
    325 
    326 	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
    327 
    328 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
    329 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
    330 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
    331 
    332 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
    333 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
    334 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
    335 
    336 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
    337 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
    338 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
    339 
    340 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
    341 	ldr	q23, [x3, #80]				//load h3l | h3h
    342 	ext	v23.16b, v23.16b, v23.16b, #8
     343 	ldr	q25, [x3, #112]				//load h4l | h4h
    344 	ext	v25.16b, v25.16b, v25.16b, #8
    345 	aese	v5.16b, v26.16b
    346 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
    347 
    348 	aese	v1.16b, v26.16b
    349 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
    350 	aese	v4.16b, v26.16b
    351 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
    352 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
    353 
    354 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
    355 	aese	v2.16b, v26.16b
    356 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
    357 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
    358 
    359 	aese	v6.16b, v26.16b
    360 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
    361 	aese	v1.16b, v27.16b
    362 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
    363 	aese	v0.16b, v26.16b
    364 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
    365 
    366 	aese	v2.16b, v27.16b
    367 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
    368 	aese	v3.16b, v26.16b
    369 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
    370 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
    371 
    372 	aese	v5.16b, v27.16b
    373 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
    374 	aese	v7.16b, v26.16b
    375 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
    376 	aese	v0.16b, v27.16b
    377 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
    378 
    379 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b,v9.16b			//GHASH block 8k+2, 8k+3 - high
    380 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
    381 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
    382 
    383 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
    384 	aese	v4.16b, v27.16b
    385 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
    386 	aese	v3.16b, v27.16b
    387 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
    388 
    389 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
    390 	aese	v7.16b, v27.16b
    391 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
    392 	aese	v6.16b, v27.16b
    393 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
    394 
    395 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
    396 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
    397 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
    398 
    399 	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
    400 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
    401 
    402 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
    403 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
    404 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
    405 
    406 	aese	v5.16b, v28.16b
    407 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
    408 	aese	v4.16b, v28.16b
    409 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
    410 	aese	v2.16b, v28.16b
    411 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
    412 
    413 	aese	v1.16b, v28.16b
    414 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
    415 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
    416 	aese	v6.16b, v28.16b
    417 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
    418 
    419 	aese	v0.16b, v28.16b
    420 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
    421 	aese	v3.16b, v28.16b
    422 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
    423 	aese	v7.16b, v28.16b
    424 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
    425 
    426 	aese	v6.16b, v26.16b
    427 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
    428 	ldr	q21, [x3, #48]				//load h2k | h1k
    429 	ldr	q24, [x3, #96]				//load h4k | h3k
    430 	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
    431 
    432 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
    433 	aese	v2.16b, v26.16b
    434 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
    435 	aese	v1.16b, v26.16b
    436 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
    437 
    438 	ldr	q20, [x3, #32]				//load h1l | h1h
    439 	ext	v20.16b, v20.16b, v20.16b, #8
     440 	ldr	q22, [x3, #64]				//load h2l | h2h
    441 	ext	v22.16b, v22.16b, v22.16b, #8
    442 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
    443 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
    444 
    445 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
    446 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
    447 
    448 	aese	v0.16b, v26.16b
    449 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
    450 	aese	v3.16b, v26.16b
    451 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
    452 
    453 	aese	v7.16b, v26.16b
    454 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
    455 	aese	v4.16b, v26.16b
    456 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
    457 
    458 	aese	v5.16b, v26.16b
    459 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
    460 	aese	v0.16b, v27.16b
    461 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
    462 
    463 	aese	v7.16b, v27.16b
    464 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
    465 	aese	v3.16b, v27.16b
    466 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
    467 	aese	v4.16b, v27.16b
    468 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
    469 
    470 	aese	v5.16b, v27.16b
    471 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
    472 	aese	v6.16b, v27.16b
    473 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
    474 	aese	v1.16b, v27.16b
    475 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
    476 
    477 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
    478 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
    479 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
    480 
    481 	aese	v2.16b, v27.16b
    482 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
    483 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
    484 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
    485 
    486 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
    487 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
    488 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
    489 
    490 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
    491 	aese	v2.16b, v28.16b
    492 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
    493 	aese	v5.16b, v28.16b
    494 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
    495 
    496 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
    497 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
    498 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
    499 
    500 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
    501 	aese	v6.16b, v28.16b
    502 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
    503 
    504 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
    505 	aese	v7.16b, v28.16b
    506 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
    507 	aese	v1.16b, v28.16b
    508 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
    509 
    510 	aese	v3.16b, v28.16b
    511 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
    512 	aese	v4.16b, v28.16b
    513 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
    514 	aese	v0.16b, v28.16b
    515 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
    516 
    517 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
    518 	ldr	d16, [x10]			//MODULO - load modulo constant
    519 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
    520 
    521 	aese	v7.16b, v26.16b
    522 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
    523 	aese	v5.16b, v26.16b
    524 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
    525 
    526 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
    527 	aese	v1.16b, v26.16b
    528 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
    529 	aese	v2.16b, v26.16b
    530 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
    531 
    532 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
    533 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
    534 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
    535 
    536 	aese	v3.16b, v26.16b
    537 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
    538 	rev32	v20.16b, v30.16b					//CTR block 8k+16
    539 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
    540 
    541 	aese	v4.16b, v26.16b
    542 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
    543 	aese	v0.16b, v26.16b
    544 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
    545 	aese	v6.16b, v26.16b
    546 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
    547 
    548 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
    549 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
    550 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
    551 
    552 	aese	v2.16b, v27.16b
    553 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
    554 	aese	v7.16b, v27.16b
    555 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
    556 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
    557 
    558 	aese	v5.16b, v27.16b
    559 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
    560 	aese	v6.16b, v27.16b
    561 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
    562 	aese	v1.16b, v27.16b
    563 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
    564 
    565 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
    566 	aese	v0.16b, v27.16b
    567 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
    568 	aese	v4.16b, v27.16b
    569 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
    570 
    571 	rev32	v22.16b, v30.16b					//CTR block 8k+17
    572 	aese	v3.16b, v27.16b
    573 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
    574 
    575 	aese	v5.16b, v28.16b
    576 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
    577 	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext
    578 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
    579 
    580 	aese	v2.16b, v28.16b
    581 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
    582 	aese	v1.16b, v28.16b
    583 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
    584 	aese	v7.16b, v28.16b
    585 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
    586 
    587 	aese	v4.16b, v28.16b
    588 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
    589 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
    590 	ldr	q27, [x8, #160]					//load rk10
    591 
    592 	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
    593 	rev32	v23.16b, v30.16b					//CTR block 8k+18
    594 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
    595 	aese	v3.16b, v28.16b
    596 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
    597 
    598 	aese	v0.16b, v28.16b
    599 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
    600 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
    601 	aese	v6.16b, v28.16b
    602 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
    603 
    604 	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
    605 	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
    606 	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
    607 
    608 	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
    609 	rev32	v25.16b, v30.16b					//CTR block 8k+19
    610 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
    611 
    612 	cmp	x0, x5				//.LOOP CONTROL
    613 .inst	0xce046d8c	//eor3 v12.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
    614 	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
    615 
    616 	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
    617 	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
    618 
    619 .inst	0xce026d4a	//eor3 v10.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result
    620 
    621 	mov	v2.16b, v23.16b					//CTR block 8k+18
    622 	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
    623 
    624 	rev32	v4.16b, v30.16b				//CTR block 8k+20
    625 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
    626 
    627 .inst	0xce076def	//eor3 v15.16b, v15.16b, v7.16b, v27.16b				//AES block 7 - result
    628 	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
    629 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
    630 
    631 .inst	0xce016d29	//eor3 v9.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result
    632 .inst	0xce036d6b	//eor3 v11.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
    633 	mov	v3.16b, v25.16b					//CTR block 8k+19
    634 
    635 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
    636 .inst	0xce056dad	//eor3 v13.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
    637 	mov	v1.16b, v22.16b					//CTR block 8k+17
    638 
    639 .inst	0xce006d08	//eor3 v8.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
    640 	mov	v0.16b, v20.16b					//CTR block 8k+16
    641 	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
    642 
    643 	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
    644 .inst	0xce066dce	//eor3 v14.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
    645 
    646 	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result
    647 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
    648 
    649 	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
    650 	b.lt	.L128_enc_main_loop
    651 
    652 .L128_enc_prepretail:	//PREPRETAIL
    653 	rev32	v5.16b, v30.16b				//CTR block 8k+13
    654 	ldr	q23, [x3, #176]				//load h7l | h7h
    655 	ext	v23.16b, v23.16b, v23.16b, #8
    656 	ldr	q25, [x3, #208]				//load h8l | h8h
    657 	ext	v25.16b, v25.16b, v25.16b, #8
    658 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
    659 
    660 	ldr	q20, [x3, #128]				//load h5l | h5h
    661 	ext	v20.16b, v20.16b, v20.16b, #8
    662 	ldr	q22, [x3, #160]				//load h6l | h6h
    663 	ext	v22.16b, v22.16b, v22.16b, #8
    664 	rev64	v8.16b, v8.16b						//GHASH block 8k
    665 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
    666 
    667 	ldr	q21, [x3, #144]				//load h6k | h5k
     668 	ldr	q24, [x3, #192]				//load h8k | h7k
    669 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
    670 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
    671 
    672 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
    673 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
    674 
    675 	rev32	v6.16b, v30.16b				//CTR block 8k+14
    676 
    677 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
    678 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
    679 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
    680 
    681 	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
    682 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
    683 
    684 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
    685 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
    686 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
    687 
    688 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
    689 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
    690 
    691 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
    692 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
    693 
    694 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
    695 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
    696 
    697 	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
    698 	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
    699 
    700 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
    701 
    702 	rev32	v7.16b, v30.16b				//CTR block 8k+15
    703 
    704 	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
    705 
    706 	aese	v2.16b, v26.16b
    707 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
    708 
    709 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
    710 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
    711 
    712 	aese	v6.16b, v26.16b
    713 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
    714 	aese	v3.16b, v26.16b
    715 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
    716 
    717 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
    718 	aese	v1.16b, v26.16b
    719 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
    720 
    721 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
    722 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
    723 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
    724 
    725 	aese	v5.16b, v26.16b
    726 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
    727 	aese	v7.16b, v26.16b
    728 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
    729 
    730 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
    731 	aese	v4.16b, v26.16b
    732 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
    733 	aese	v0.16b, v26.16b
    734 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
    735 
    736 	aese	v3.16b, v27.16b
    737 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
    738 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
    739 
    740 	ldr	q23, [x3, #80]				//load h3l | h3h
    741 	ext	v23.16b, v23.16b, v23.16b, #8
    742 	ldr	q25, [x3, #112]				//load h4l | h4h
    743 	ext	v25.16b, v25.16b, v25.16b, #8
    744 
    745 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
    746 	aese	v5.16b, v27.16b
    747 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
    748 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
    749 
    750 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
    751 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
    752 
    753 	aese	v1.16b, v27.16b
    754 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
    755 	aese	v0.16b, v27.16b
    756 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
    757 
    758 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
    759 	ldr	q21, [x3, #48]				//load h2k | h1k
    760 	ldr	q24, [x3, #96]				//load h4k | h3k
    761 	aese	v2.16b, v27.16b
    762 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
    763 
    764 	aese	v4.16b, v27.16b
    765 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
    766 	aese	v7.16b, v27.16b
    767 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
    768 
    769 	aese	v5.16b, v28.16b
    770 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
    771 	aese	v2.16b, v28.16b
    772 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
    773 	aese	v3.16b, v28.16b
    774 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
    775 
    776 	aese	v1.16b, v28.16b
    777 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
    778 	aese	v6.16b, v27.16b
    779 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
    780 	aese	v4.16b, v28.16b
    781 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
    782 
    783 	aese	v5.16b, v26.16b
    784 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
    785 	aese	v0.16b, v28.16b
    786 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
    787 
    788 	aese	v6.16b, v28.16b
    789 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
    790 	aese	v7.16b, v28.16b
    791 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
    792 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
    793 
    794 	ldr	q20, [x3, #32]				//load h1l | h1h
    795 	ext	v20.16b, v20.16b, v20.16b, #8
    796 	ldr	q22, [x3, #64]				//load h1l | h1h
    797 	ext	v22.16b, v22.16b, v22.16b, #8
    798 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
    799 	aese	v0.16b, v26.16b
    800 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
    801 
    802 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
    803 	aese	v6.16b, v26.16b
    804 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
    805 	aese	v3.16b, v26.16b
    806 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
    807 
    808 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
    809 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
    810 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
    811 
    812 	aese	v2.16b, v26.16b
    813 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
    814 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
    815 
    816 	aese	v7.16b, v26.16b
    817 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
    818 	aese	v1.16b, v26.16b
    819 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
    820 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
    821 
    822 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
    823 	aese	v4.16b, v26.16b
    824 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
    825 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
    826 
    827 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
    828 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
    829 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
    830 
    831 	aese	v1.16b, v27.16b
    832 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
    833 	aese	v3.16b, v27.16b
    834 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
    835 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
    836 
    837 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
    838 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
    839 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
    840 
    841 	aese	v1.16b, v28.16b
    842 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
    843 	aese	v6.16b, v27.16b
    844 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
    845 	aese	v0.16b, v27.16b
    846 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
    847 
    848 	aese	v7.16b, v27.16b
    849 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
    850 	aese	v2.16b, v27.16b
    851 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
    852 
    853 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
    854 	aese	v4.16b, v27.16b
    855 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
    856 	aese	v5.16b, v27.16b
    857 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
    858 
    859 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
    860 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
    861 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
    862 
    863 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
    864 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
    865 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
    866 
    867 	aese	v0.16b, v28.16b
    868 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
    869 	aese	v7.16b, v28.16b
    870 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
    871 	ldr	d16, [x10]			//MODULO - load modulo constant
    872 
    873 	aese	v2.16b, v28.16b
    874 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
    875 	aese	v4.16b, v28.16b
    876 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
    877 
    878 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
    879 	aese	v5.16b, v28.16b
    880 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
    881 	aese	v6.16b, v28.16b
    882 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
    883 
    884 	aese	v3.16b, v28.16b
    885 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
    886 	aese	v4.16b, v26.16b
    887 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
    888 
    889 	aese	v5.16b, v26.16b
    890 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
    891 	aese	v2.16b, v26.16b
    892 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
    893 	aese	v0.16b, v26.16b
    894 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
    895 
    896 	aese	v3.16b, v26.16b
    897 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
    898 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
    899 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
    900 
    901 	aese	v6.16b, v26.16b
    902 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
    903 	aese	v1.16b, v26.16b
    904 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
    905 	aese	v7.16b, v26.16b
    906 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
    907 
    908 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
    909 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
    910 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
    911 
    912 	aese	v3.16b, v27.16b
    913 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
    914 	aese	v6.16b, v27.16b
    915 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
    916 	aese	v1.16b, v27.16b
    917 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
    918 	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
    919 
    920 	aese	v5.16b, v27.16b
    921 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
    922 	aese	v0.16b, v27.16b
    923 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
    924 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
    925 
    926 	aese	v2.16b, v27.16b
    927 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
    928 	aese	v7.16b, v27.16b
    929 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
    930 
    931 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
    932 	aese	v4.16b, v27.16b
    933 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
    934 
    935 	aese	v7.16b, v28.16b
    936 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
    937 	aese	v2.16b, v28.16b
    938 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
    939 	aese	v1.16b, v28.16b
    940 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
    941 	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
    942 
    943 	aese	v6.16b, v28.16b
    944 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
    945 .inst	0xce114a73	//eor3 v19.16b, v19.16b, v17.16b, v18.16b		 	//MODULO - fold into low
    946 	aese	v4.16b, v28.16b
    947 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
    948 
    949 	aese	v3.16b, v28.16b
    950 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
    951 	aese	v0.16b, v28.16b
    952 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
    953 	aese	v5.16b, v28.16b
    954 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
    955 
    956 	ldr	q27, [x8, #160]					//load rk10
    957 	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
    958 	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
    959 
    960 	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
    961 	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
    962 
    963 	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
    964 	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
    965 
    966 	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
    967 	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
    968 .L128_enc_tail:	//TAIL - dispatch on the bytes left (x5) to the matching final-N entry point below
    969 
    970 	sub	x5, x4, x0 	//x5 = x4 - x0: bytes left to process (x4 presumably holds end_input_ptr, set outside this view - confirm)
    971 	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
    972 
    973 	mov	v29.16b, v27.16b				//v29 = copy of v27 (rk10 when falling in from above); v27 is reused as GHASH scratch below
    974 	ldp	q20, q21, [x3, #128]			//load h5l | h5h into q20; q21 = next 16 bytes (presumably h6k | h5k - confirm)
    975 	ext	v20.16b, v20.16b, v20.16b, #8				//swap the 64-bit halves of v20
    976 
    977 .inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
    978 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag (v19 with its halves swapped, kept in v16)
    979 	ldp	q22, q23, [x3, #160]			//load h6l | h6h into q22; q23 = next 16 bytes (presumably h7l | h7h - confirm)
    980 	ext	v22.16b, v22.16b, v22.16b, #8				//swap the 64-bit halves of v22
    981 	ext	v23.16b, v23.16b, v23.16b, #8				//swap the 64-bit halves of v23
    982 
    983 	ldp	q24, q25, [x3, #192]			//load h8k | h7k into q24; q25 = next 16 bytes (presumably h8l | h8h - confirm)
    984 	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of v25
    985 	cmp	x5, #112				//more than 7 blocks (112 bytes) left?
    986 	b.gt	.L128_enc_blocks_more_than_7
    987 
    988 	mov	v7.16b, v6.16b				//7 or fewer blocks: move each keystream block up one register (v1..v6 -> v2..v7)
    989 	mov	v6.16b, v5.16b
    990 	movi	v17.8b, #0				//clear GHASH high accumulator (the skipped final-7 block is what would have written it)
    991 
    992 	cmp	x5, #96				//more than 6 blocks (96 bytes) left?
    993 	sub	v30.4s, v30.4s, v31.4s				//step the counter back by one increment for the unused block
    994 	mov	v5.16b, v4.16b
    995 
    996 	mov	v4.16b, v3.16b
    997 	mov	v3.16b, v2.16b
    998 	mov	v2.16b, v1.16b
    999 
   1000 	movi	v19.8b, #0				//clear GHASH low accumulator (its previous value is already saved in v16)
   1001 	movi	v18.8b, #0				//clear GHASH mid accumulator
   1002 	b.gt	.L128_enc_blocks_more_than_6
   1003 
   1004 	mov	v7.16b, v6.16b				//6 or fewer blocks: shift the keystream blocks up one more register
   1005 	cmp	x5, #80				//more than 5 blocks (80 bytes) left?
   1006 
   1007 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1008 	mov	v6.16b, v5.16b
   1009 	mov	v5.16b, v4.16b
   1010 
   1011 	mov	v4.16b, v3.16b
   1012 	mov	v3.16b, v1.16b				//v1 is not written in this ladder, so it still holds the first unused keystream block
   1013 	b.gt	.L128_enc_blocks_more_than_5
   1014 
   1015 	cmp	x5, #64				//more than 4 blocks (64 bytes) left?
   1016 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1017 
   1018 	mov	v7.16b, v6.16b
   1019 	mov	v6.16b, v5.16b
   1020 
   1021 	mov	v5.16b, v4.16b
   1022 	mov	v4.16b, v1.16b
   1023 	b.gt	.L128_enc_blocks_more_than_4
   1024 
   1025 	mov	v7.16b, v6.16b
   1026 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1027 	mov	v6.16b, v5.16b
   1028 
   1029 	mov	v5.16b, v1.16b
   1030 	cmp	x5, #48				//more than 3 blocks (48 bytes) left?
   1031 	b.gt	.L128_enc_blocks_more_than_3
   1032 
   1033 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1034 	mov	v7.16b, v6.16b
   1035 	mov	v6.16b, v1.16b
   1036 
   1037 	cmp	x5, #32				//more than 2 blocks (32 bytes) left?
   1038 	ldr	q24, [x3, #96]					//load h4k | h3k (used by the final-2 block's mid product)
   1039 	b.gt	.L128_enc_blocks_more_than_2
   1040 
   1041 	cmp	x5, #16				//more than 1 block (16 bytes) left?
   1042 
   1043 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1044 	mov	v7.16b, v1.16b
   1045 	b.gt	.L128_enc_blocks_more_than_1
   1046 
   1047 	ldr	q21, [x3, #48]					//load h2k | h1k (used by the final block's mid product)
   1048 	sub	v30.4s, v30.4s, v31.4s				//step the counter back again
   1049 	b	.L128_enc_blocks_less_than_1
   1050 .L128_enc_blocks_more_than_7:	//blocks left > 7: store the final-7 result, start the GHASH accumulators with it, form the final-6 result
   1051 	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result
   1052 
   1053 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   1054 	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
   1055 
   1056 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   1057 
   1058 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid (v27.d[0] = upper half of the block)
   1059 
   1060 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high (writes v17 directly, no accumulate)
   1061 
   1062 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid (v18.d[0] = upper half of v24, the key operand for the mid product)
   1063 
   1064 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid (v27.d[0] = upper half ^ lower half of the block)
   1065 	movi	v16.8b, #0						//suppress further partial tag feed in
   1066 
   1067 .inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
   1068 
   1069 	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid (writes v18 directly, no accumulate)
   1070 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low (writes v19 directly, no accumulate)
   1071 .L128_enc_blocks_more_than_6:	//blocks left > 6: store the final-6 result, accumulate its GHASH products, form the final-5 result
   1072 
   1073 	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result
   1074 
   1075 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   1076 	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
   1077 
   1078 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1079 
   1080 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid (v27.d[0] = upper half of the block)
   1081 
   1082 .inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
   1083 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
   1084 
   1085 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid (upper half ^ lower half)
   1086 	movi	v16.8b, #0						//suppress further partial tag feed in
   1087 
   1088 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid (multiplies by the lower half of v24)
   1089 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   1090 
   1091 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low (accumulate)
   1092 
   1093 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid (accumulate)
   1094 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high (accumulate)
   1095 .L128_enc_blocks_more_than_5:	//blocks left > 5: store the final-5 result, accumulate its GHASH products, form the final-4 result
   1096 
   1097 	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result
   1098 
   1099 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   1100 
   1101 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1102 
   1103 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid (v27.d[0] = upper half of the block)
   1104 	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
   1105 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   1106 
   1107 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high (accumulate)
   1108 
   1109 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid (upper half ^ lower half)
   1110 
   1111 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid (copy to the upper half so pmull2 can pair it with the upper half of v21)
   1112 
   1113 .inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
   1114 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   1115 	movi	v16.8b, #0						//suppress further partial tag feed in
   1116 
   1117 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid (multiplies by the upper half of v21)
   1118 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low (accumulate)
   1119 
   1120 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid (accumulate)
   1121 .L128_enc_blocks_more_than_4:	//blocks left > 4: store the final-4 result, accumulate its GHASH products, form the final-3 result
   1122 
   1123 	st1	{ v9.16b}, [x2], #16			  	//AES final-4 block - store result
   1124 
   1125 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   1126 
   1127 	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
   1128 
   1129 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1130 
   1131 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid (v27.d[0] = upper half of the block)
   1132 	movi	v16.8b, #0						//suppress further partial tag feed in
   1133 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   1134 
   1135 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid (upper half ^ lower half)
   1136 
   1137 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   1138 
   1139 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high (accumulate)
   1140 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid (multiplies by the lower half of v21)
   1141 
   1142 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low (accumulate)
   1143 
   1144 .inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
   1145 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid (accumulate)
   1146 .L128_enc_blocks_more_than_3:	//blocks left > 3: store the final-3 result, accumulate its GHASH products, form the final-2 result
   1147 
   1148 	st1	{ v9.16b}, [x2], #16			  	//AES final-3 block - store result
   1149 
   1150 	ldr	q25, [x3, #112]				//load h4l | h4h (replaces the value v25 was given at the top of the tail)
   1151 	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of v25
   1152 
   1153 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   1154 
   1155 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1156 	movi	v16.8b, #0						//suppress further partial tag feed in
   1157 
   1158 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid (v27.d[0] = upper half of the block)
   1159 	ldr	q24, [x3, #96]				//load h4k | h3k
   1160 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   1161 
   1162 	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
   1163 
   1164 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid (upper half ^ lower half)
   1165 
   1166 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid (copy to the upper half so pmull2 can pair it with the upper half of v24)
   1167 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low (accumulate)
   1168 
   1169 .inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
   1170 
   1171 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid (multiplies by the upper half of v24)
   1172 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   1173 
   1174 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid (accumulate)
   1175 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high (accumulate)
   1176 .L128_enc_blocks_more_than_2:	//blocks left > 2: store the final-2 result, accumulate its GHASH products, form the final-1 result
   1177 
   1178 	st1	{ v9.16b}, [x2], #16			  	//AES final-2 block - store result
   1179 
   1180 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   1181 
   1182 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1183 
   1184 	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
   1185 
   1186 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid (v27.d[0] = upper half of the block)
   1187 	ldr	q23, [x3, #80]				//load h3l | h3h (replaces the value v23 was given at the top of the tail)
   1188 	ext	v23.16b, v23.16b, v23.16b, #8				//swap the 64-bit halves of v23
   1189 	movi	v16.8b, #0						//suppress further partial tag feed in
   1190 
   1191 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid (upper half ^ lower half)
   1192 .inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
   1193 
   1194 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   1195 
   1196 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   1197 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid (multiplies by the lower half of v24)
   1198 
   1199 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high (accumulate)
   1200 
   1201 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid (accumulate)
   1202 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low (accumulate)
   1203 .L128_enc_blocks_more_than_1:	//blocks left > 1: store the final-1 result, accumulate its GHASH products, form the final block's result
   1204 
   1205 	st1	{ v9.16b}, [x2], #16			  	//AES final-1 block - store result
   1206 
   1207 	ldr	q22, [x3, #64]				//load h2l | h2h
   1208 	ext	v22.16b, v22.16b, v22.16b, #8				//swap the 64-bit halves of v22
   1209 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   1210 	ldr	q9, [x0], #16				//AES final block - load plaintext (always a 16-byte load; NOTE(review): confirm callers keep all 16 bytes readable when the last block is partial)
   1211 
   1212 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1213 
   1214 	movi	v16.8b, #0						//suppress further partial tag feed in
   1215 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid (v27.d[0] = upper half of the block)
   1216 .inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
   1217 
   1218 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   1219 
   1220 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid (upper half ^ lower half)
   1221 
   1222 	ldr	q21, [x3, #48]				//load h2k | h1k
   1223 
   1224 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid (copy to the upper half so pmull2 can pair it with the upper half of v21)
   1225 
   1226 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   1227 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid (multiplies by the upper half of v21)
   1228 
   1229 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high (accumulate)
   1230 
   1231 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid (accumulate)
   1232 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low (accumulate)
   1233 .L128_enc_blocks_less_than_1:	//blocks left <= 1: mask and store the last (possibly partial) block, GHASH it, reduce, store the result to [x3] and return
   1234 
   1235 	rev32	v30.16b, v30.16b				//undo the rev32 applied to the counter when it was set up
   1236 	str	q30, [x16]					//store the updated counter
   1237 	and	x1, x1, #127			 	//bit_length %= 128
   1238 
   1239 	sub	x1, x1, #128			 	//bit_length -= 128
   1240 
   1241 	neg	x1, x1				//x1 = 128 - (bit_length % 128), in range [1,128]
   1242 
   1243 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   1244 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   1245 	and	x1, x1, #127			 	//x1 %= 128: number of unused bits in the last block (0 when the block is full)
   1246 
   1247 	lsr	x6, x6, x1				//temp0_x = all-ones >> (x1 mod 64); a register shift uses the count modulo 64
   1248 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   1249 	cmp	x1, #64				//fewer than 64 unused bits?
   1250 
   1251 	csel	x13, x7, x6, lt				//mask for d[0]: all ones if x1 < 64, otherwise the shifted mask
   1252 	csel	x14, x6, xzr, lt				//mask for d[1]: the shifted mask if x1 < 64, otherwise zero
   1253 
   1254 	mov	v0.d[1], x14
   1255 	mov	v0.d[0], x13					//ctr0b is mask for last block
   1256 
   1257 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   1258 
   1259 	rev64	v8.16b, v9.16b						//GHASH final block
   1260 
   1261 	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing (takes v26 bits where the mask is 0)
   1262 	st1	{ v9.16b}, [x2]				//store all 16B
   1263 
   1264 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag (v16 is zero if an earlier tail block already consumed it)
   1265 
   1266 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid (v16 is reused as scratch from here on)
   1267 
   1268 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid (upper half ^ lower half)
   1269 	ldr	q20, [x3, #32]				//load h1l | h1h
   1270 	ext	v20.16b, v20.16b, v20.16b, #8				//swap the 64-bit halves of v20
   1271 
   1272 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid (multiplies by the lower half of v21)
   1273 
   1274 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   1275 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid (accumulate)
   1276 	ldr	d16, [x10]			//MODULO - load modulo constant (0xc200000000000000, stored at sp+64 in the prologue)
   1277 
   1278 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   1279 
   1280 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high (accumulate)
   1281 
   1282 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low (accumulate)
   1283 
   1284 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   1285 	pmull	v29.1q, v17.1d, v16.1d		  	//MODULO - top 64b align with mid
   1286 
   1287 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		  	//MODULO - karatsuba tidy up
   1288 
   1289 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b		 	//MODULO - fold into mid
   1290 
   1291 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   1292 	ext	v21.16b, v18.16b, v18.16b, #8			  	//MODULO - other mid alignment
   1293 
   1294 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		  	//MODULO - fold into low
   1295 	ext	v19.16b, v19.16b, v19.16b, #8				//swap the 64-bit halves of the reduced value
   1296 	rev64	v19.16b, v19.16b				//reverse the bytes within each 64-bit half
   1297 	st1	{ v19.16b }, [x3]				//store the reduced GHASH value at [x3]
   1298 	mov	x0, x9				//return x9, which was set to x1 >> 3 at function entry
   1299 
   1300 	ldp	d10, d11, [sp, #16]				//restore d10-d15 from the frame written by the prologue
   1301 	ldp	d12, d13, [sp, #32]
   1302 	ldp	d14, d15, [sp, #48]
   1303 	ldp	d8, d9, [sp], #80				//restore d8/d9 and release the 80-byte frame
   1304 	ret
   1305 
   1306 .L128_enc_ret:				//early exit taken by the cbz at function entry when x1 == 0; nothing has been pushed on the stack yet
   1307 	mov	w0, #0x0				//return 0
   1308 	ret
   1309 .size	unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
   1310 .globl	unroll8_eor3_aes_gcm_dec_128_kernel
   1311 .type	unroll8_eor3_aes_gcm_dec_128_kernel,%function
   1312 .align	4
   1313 unroll8_eor3_aes_gcm_dec_128_kernel:
   1314 	AARCH64_VALID_CALL_TARGET
   1315 	cbz	x1, .L128_dec_ret
   1316 	stp	d8, d9, [sp, #-80]!
   1317 	lsr	x9, x1, #3
   1318 	mov	x16, x4
   1319 	mov	x8, x5
   1320 	stp	d10, d11, [sp, #16]
   1321 	stp	d12, d13, [sp, #32]
   1322 	stp	d14, d15, [sp, #48]
   1323 	mov	x5, #0xc200000000000000
   1324 	stp	x5, xzr, [sp, #64]
   1325 	add	x10, sp, #64
   1326 
   1327 	mov	x5, x9
   1328 	ld1	{ v0.16b}, [x16]					//CTR block 0
   1329 
   1330 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   1331 	sub	x5, x5, #1		//byte_len - 1
   1332 
   1333 	mov	x15, #0x100000000				//set up counter increment
   1334 	movi	v31.16b, #0x0
   1335 	mov	v31.d[1], x15
   1336 	ld1	{ v19.16b}, [x3]
   1337 	ext	v19.16b, v19.16b, v19.16b, #8
   1338 	rev64	v19.16b, v19.16b
   1339 
   1340 	rev32	v30.16b, v0.16b				//set up reversed counter
   1341 
   1342 	aese	v0.16b, v26.16b
   1343 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
   1344 
   1345 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
   1346 
   1347 	rev32	v1.16b, v30.16b				//CTR block 1
   1348 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
   1349 
   1350 	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   1351 
   1352 	rev32	v2.16b, v30.16b				//CTR block 2
   1353 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
   1354 	aese	v1.16b, v26.16b
   1355 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
   1356 
   1357 	rev32	v3.16b, v30.16b				//CTR block 3
   1358 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
   1359 
   1360 	aese	v0.16b, v27.16b
   1361 	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
   1362 	aese	v1.16b, v27.16b
   1363 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
   1364 
   1365 	rev32	v4.16b, v30.16b				//CTR block 4
   1366 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
   1367 
   1368 	rev32	v5.16b, v30.16b				//CTR block 5
   1369 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
   1370 
   1371 	aese	v2.16b, v26.16b
   1372 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
   1373 
   1374 	rev32	v6.16b, v30.16b				//CTR block 6
   1375 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
   1376 	aese	v5.16b, v26.16b
   1377 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
   1378 
   1379 	aese	v3.16b, v26.16b
   1380 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
   1381 	aese	v4.16b, v26.16b
   1382 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
   1383 
   1384 	rev32	v7.16b, v30.16b				//CTR block 7
   1385 
   1386 	aese	v6.16b, v26.16b
   1387 	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
   1388 	aese	v2.16b, v27.16b
   1389 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
   1390 
   1391 	aese	v7.16b, v26.16b
   1392 	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
   1393 
   1394 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   1395 
   1396 	aese	v6.16b, v27.16b
   1397 	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
   1398 	aese	v5.16b, v27.16b
   1399 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
   1400 
   1401 	aese	v4.16b, v27.16b
   1402 	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
   1403 	aese	v7.16b, v27.16b
   1404 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
   1405 
   1406 	aese	v7.16b, v28.16b
   1407 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
   1408 	aese	v0.16b, v28.16b
   1409 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
   1410 	aese	v3.16b, v27.16b
   1411 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
   1412 
   1413 	aese	v6.16b, v28.16b
   1414 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
   1415 	aese	v2.16b, v28.16b
   1416 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
   1417 	aese	v5.16b, v28.16b
   1418 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
   1419 
   1420 	aese	v4.16b, v28.16b
   1421 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
   1422 	aese	v3.16b, v28.16b
   1423 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
   1424 	aese	v1.16b, v28.16b
   1425 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
   1426 
   1427 	aese	v6.16b, v26.16b
   1428 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
   1429 	aese	v2.16b, v26.16b
   1430 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
   1431 
   1432 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   1433 	aese	v5.16b, v26.16b
   1434 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
   1435 
   1436 	aese	v0.16b, v26.16b
   1437 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
   1438 	aese	v7.16b, v26.16b
   1439 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
   1440 
   1441 	aese	v3.16b, v26.16b
   1442 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
   1443 	aese	v1.16b, v26.16b
   1444 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
   1445 
   1446 	aese	v0.16b, v27.16b
   1447 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
   1448 	aese	v7.16b, v27.16b
   1449 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
   1450 	aese	v4.16b, v26.16b
   1451 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
   1452 
   1453 	aese	v6.16b, v27.16b
   1454 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
   1455 	aese	v1.16b, v27.16b
   1456 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
   1457 	aese	v3.16b, v27.16b
   1458 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
   1459 
   1460 	aese	v5.16b, v27.16b
   1461 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
   1462 	aese	v4.16b, v27.16b
   1463 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
   1464 	aese	v2.16b, v27.16b
   1465 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
   1466 
   1467 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   1468 	aese	v2.16b, v28.16b
   1469 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
   1470 	aese	v3.16b, v28.16b
   1471 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
   1472 
   1473 	aese	v6.16b, v28.16b
   1474 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
   1475 	aese	v1.16b, v28.16b
   1476 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
   1477 
   1478 	aese	v7.16b, v28.16b
   1479 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
   1480 	aese	v5.16b, v28.16b
   1481 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
   1482 
   1483 	aese	v4.16b, v28.16b
   1484 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
   1485 
   1486 	aese	v3.16b, v26.16b
   1487 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
   1488 	aese	v2.16b, v26.16b
   1489 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
   1490 	aese	v0.16b, v28.16b
   1491 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
   1492 
   1493 	aese	v5.16b, v26.16b
   1494 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
   1495 	aese	v4.16b, v26.16b
   1496 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
   1497 	aese	v1.16b, v26.16b
   1498 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
   1499 
   1500 	aese	v0.16b, v26.16b
   1501 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
   1502 	aese	v7.16b, v26.16b
   1503 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
   1504 	aese	v6.16b, v26.16b
   1505 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
   1506 
   1507 	aese	v3.16b, v27.16b
   1508 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
   1509 	aese	v4.16b, v27.16b
   1510 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
   1511 	aese	v1.16b, v27.16b
   1512 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
   1513 
   1514 	aese	v7.16b, v27.16b
   1515 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
   1516 	aese	v5.16b, v27.16b
   1517 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
   1518 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   1519 
   1520 	aese	v6.16b, v27.16b
   1521 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
   1522 	aese	v2.16b, v27.16b
   1523 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
   1524 	aese	v0.16b, v27.16b
   1525 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
   1526 
   1527 	add	x5, x5, x0
   1528 	add	v30.4s, v30.4s, v31.4s		//CTR block 7
   1529 
   1530 	aese	v6.16b, v28.16b
   1531 	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
   1532 	aese	v0.16b, v28.16b
   1533 	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
   1534 
   1535 	aese	v1.16b, v28.16b
   1536 	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
   1537 	aese	v7.16b, v28.16b
   1538 	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
   1539 	aese	v3.16b, v28.16b
   1540 	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
   1541 
   1542 	aese	v5.16b, v28.16b
   1543 	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
   1544 	aese	v2.16b, v28.16b
   1545 	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
   1546 	aese	v4.16b, v28.16b
   1547 	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
   1548 
   1549 	aese	v0.16b, v26.16b						//AES block 0 - round 9
   1550 	aese	v1.16b, v26.16b						//AES block 1 - round 9
   1551 	aese	v6.16b, v26.16b						//AES block 6 - round 9
   1552 
   1553 	ldr	q27, [x8, #160]					//load rk10
   1554 	aese	v4.16b, v26.16b						//AES block 4 - round 9
   1555 	aese	v3.16b, v26.16b						//AES block 3 - round 9
   1556 
   1557 	aese	v2.16b, v26.16b						//AES block 2 - round 9
   1558 	aese	v5.16b, v26.16b						//AES block 5 - round 9
   1559 	aese	v7.16b, v26.16b						//AES block 7 - round 9
   1560 
   1561 	add	x4, x0, x1, lsr #3		//end_input_ptr
   1562 	cmp	x0, x5				//check if we have <= 8 blocks
   1563 	b.ge	.L128_dec_tail						//handle tail
   1564 
   1565 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
   1566 
   1567 .inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 0 - result
   1568 .inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 1 - result
   1569 	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
   1570 
   1571 	rev32	v0.16b, v30.16b				//CTR block 8
   1572 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
   1573 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
   1574 
   1575 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
   1576 
   1577 	rev32	v1.16b, v30.16b				//CTR block 9
   1578 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
   1579 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
   1580 
   1581 .inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 3 - result
   1582 .inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 2 - result
   1583 	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
   1584 
   1585 	rev32	v2.16b, v30.16b				//CTR block 10
   1586 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
   1587 
   1588 .inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 6 - result
   1589 
   1590 	rev32	v3.16b, v30.16b				//CTR block 11
   1591 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
   1592 
   1593 .inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 4 - result
   1594 .inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 5 - result
   1595 	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
   1596 
   1597 .inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 7 - result
   1598 	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
   1599 	rev32	v4.16b, v30.16b				//CTR block 12
   1600 
   1601 	cmp	x0, x5				//check if we have <= 8 blocks
   1602 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
   1603 	b.ge	.L128_dec_prepretail					//do prepretail
   1604 
   1605 .L128_dec_main_loop:	//main loop start: GHASH the 8 ciphertext blocks already held in v8-v15 while decrypting the next 8
        	//Register roles, as used by the instructions in this loop:
        	//  x0  = ciphertext read pointer, x2 = output write pointer (both post-incremented by 32 per ldp/stp)
        	//  x3  = base of the GHASH key table, x8 = base of the AES round keys (rk0..rk10 at #0..#160)
        	//  x5  = input address at which the loop stops, x10 = address of the modulo constant
        	//  v0-v7   = AES state of the 8 counter blocks being encrypted
        	//  v8-v15  = ciphertext blocks: hashed at the top of the loop, reloaded near the bottom
        	//  v17/v18/v19 = GHASH high/mid/low accumulators; v19 carries the folded result between iterations
        	//  v30 = running counter (incremented by v31), rev32'd into each new counter block
   1606 	ldr	q23, [x3, #176]				//load h7l | h7h
   1607 	ext	v23.16b, v23.16b, v23.16b, #8
   1608 	ldr	q25, [x3, #208]				//load h8l | h8h
   1609 	ext	v25.16b, v25.16b, v25.16b, #8
   1610 	//rev64 the ciphertext blocks for GHASH and fold the running tag (v19) into block 8k (PRE 0 / PRE 1)
   1611 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   1612 	rev64	v8.16b, v8.16b						//GHASH block 8k
   1613 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   1614 
   1615 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   1616 	ldr	q20, [x3, #128]				//load h5l | h5h
   1617 	ext	v20.16b, v20.16b, v20.16b, #8
   1618 	ldr	q22, [x3, #160]				//load h6l | h6h
   1619 	ext	v22.16b, v22.16b, v22.16b, #8
   1620 
   1621 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
   1622 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   1623 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   1624 
   1625 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   1626 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   1627 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   1628 
   1629 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   1630 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   1631 	ldr	q21, [x3, #144]				//load h6k | h5k
   1632 	ldr	q24, [x3, #192]				//load h8k | h7k
   1633 
   1634 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   1635 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   1636 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   1637 
   1638 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   1639 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   1640 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   1641 
   1642 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   1643 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   1644 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   1645 	//AES rounds 0-2 on all 8 counter blocks, interleaved with GHASH of blocks 8k..8k+3
   1646 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   1647 	aese	v4.16b, v26.16b
   1648 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   1649 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   1650 
   1651 	aese	v6.16b, v26.16b
   1652 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   1653 	aese	v5.16b, v26.16b
   1654 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   1655 	aese	v7.16b, v26.16b
   1656 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   1657 
   1658 	aese	v3.16b, v26.16b
   1659 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   1660 	aese	v2.16b, v26.16b
   1661 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   1662 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   1663 
   1664 	aese	v1.16b, v26.16b
   1665 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   1666 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   1667 	aese	v0.16b, v26.16b
   1668 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   1669 
   1670 	aese	v2.16b, v27.16b
   1671 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   1672 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   1673 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   1674 
   1675 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   1676 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   1677 	aese	v7.16b, v27.16b
   1678 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   1679 
   1680 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   1681 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   1682 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   1683 
   1684 	ldr	q23, [x3, #80]				//load h3l | h3h
   1685 	ext	v23.16b, v23.16b, v23.16b, #8
   1686 	ldr	q25, [x3, #112]				//load h4l | h4h
   1687 	ext	v25.16b, v25.16b, v25.16b, #8
   1688 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   1689 	aese	v6.16b, v27.16b
   1690 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   1691 
   1692 	aese	v4.16b, v27.16b
   1693 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   1694 	aese	v5.16b, v27.16b
   1695 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   1696 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   1697 
   1698 	aese	v3.16b, v27.16b
   1699 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   1700 	aese	v0.16b, v27.16b
   1701 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   1702 	aese	v1.16b, v27.16b
   1703 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   1704 
   1705 	aese	v7.16b, v28.16b
   1706 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   1707 	aese	v2.16b, v28.16b
   1708 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   1709 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   1710 
   1711 	aese	v4.16b, v28.16b
   1712 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   1713 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   1714 	ldr	q20, [x3, #32]				//load h1l | h1h
   1715 	ext	v20.16b, v20.16b, v20.16b, #8
   1716 	ldr	q22, [x3, #64]				//load h2l | h2h
   1717 	ext	v22.16b, v22.16b, v22.16b, #8
   1718 
   1719 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   1720 	aese	v1.16b, v28.16b
   1721 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   1722 	aese	v3.16b, v28.16b
   1723 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   1724 
   1725 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   1726 	aese	v5.16b, v28.16b
   1727 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   1728 	aese	v0.16b, v28.16b
   1729 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   1730 
   1731 	aese	v6.16b, v28.16b
   1732 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   1733 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   1734 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   1735 	//AES rounds 3-6, interleaved with GHASH of blocks 8k+4..8k+7
   1736 	aese	v7.16b, v26.16b
   1737 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   1738 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   1739 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   1740 
   1741 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   1742 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   1743 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   1744 
   1745 	ldr	q21, [x3, #48]				//load h2k | h1k
   1746 	ldr	q24, [x3, #96]				//load h4k | h3k
   1747 	aese	v2.16b, v26.16b
   1748 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   1749 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   1750 
   1751 	aese	v4.16b, v26.16b
   1752 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   1753 	aese	v3.16b, v26.16b
   1754 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   1755 	aese	v1.16b, v26.16b
   1756 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   1757 
   1758 	aese	v0.16b, v26.16b
   1759 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   1760 	aese	v6.16b, v26.16b
   1761 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   1762 	aese	v5.16b, v26.16b
   1763 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   1764 
   1765 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   1766 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   1767 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   1768 
   1769 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   1770 	aese	v0.16b, v27.16b
   1771 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   1772 	aese	v7.16b, v27.16b
   1773 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   1774 
   1775 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   1776 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   1777 	aese	v3.16b, v27.16b
   1778 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   1779 
   1780 	aese	v1.16b, v27.16b
   1781 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   1782 	aese	v5.16b, v27.16b
   1783 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   1784 	aese	v6.16b, v27.16b
   1785 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   1786 
   1787 	aese	v2.16b, v27.16b
   1788 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   1789 	aese	v4.16b, v27.16b
   1790 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   1791 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   1792 
   1793 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   1794 	aese	v0.16b, v28.16b
   1795 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   1796 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   1797 
   1798 	aese	v2.16b, v28.16b
   1799 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   1800 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   1801 	aese	v1.16b, v28.16b
   1802 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   1803 
   1804 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   1805 	aese	v6.16b, v28.16b
   1806 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   1807 	aese	v7.16b, v28.16b
   1808 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   1809 
   1810 	aese	v3.16b, v28.16b
   1811 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   1812 	aese	v5.16b, v28.16b
   1813 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   1814 	aese	v4.16b, v28.16b
   1815 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   1816 
   1817 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   1818 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b 			//GHASH block 8k+4, 8k+5 - mid
   1819 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   1820 
   1821 	aese	v3.16b, v26.16b
   1822 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   1823 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   1824 	aese	v7.16b, v26.16b
   1825 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   1826 
   1827 	aese	v1.16b, v26.16b
   1828 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   1829 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   1830 	aese	v6.16b, v26.16b
   1831 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   1832 
   1833 	aese	v2.16b, v26.16b
   1834 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   1835 	aese	v5.16b, v26.16b
   1836 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   1837 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   1838 
   1839 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   1840 	aese	v0.16b, v26.16b
   1841 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   1842 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   1843 
   1844 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   1845 	aese	v4.16b, v26.16b
   1846 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   1847 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   1848 	//MODULO: reduce the accumulated high/mid/low products (v17/v18/v19) with the constant read from [x10], overlapped with AES rounds 7-9
   1849 	ldr	d16, [x10]			//MODULO - load modulo constant
   1850 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   1851 	aese	v5.16b, v27.16b
   1852 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   1853 	//next iteration's counter blocks 8k+16..8k+19 are staged in v20, v22, v23, v25 (no longer needed by GHASH) since v0-v3 are still mid-encryption
   1854 	rev32	v20.16b, v30.16b					//CTR block 8k+16
   1855 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   1856 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
   1857 
   1858 	aese	v6.16b, v27.16b
   1859 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   1860 	aese	v3.16b, v27.16b
   1861 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   1862 	aese	v7.16b, v27.16b
   1863 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   1864 
   1865 	aese	v2.16b, v27.16b
   1866 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   1867 	aese	v1.16b, v27.16b
   1868 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   1869 	rev32	v22.16b, v30.16b					//CTR block 8k+17
   1870 
   1871 	aese	v4.16b, v27.16b
   1872 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   1873 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   1874 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   1875 
   1876 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   1877 	aese	v0.16b, v27.16b
   1878 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   1879 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
   1880 
   1881 	aese	v5.16b, v28.16b
   1882 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   1883 	aese	v1.16b, v28.16b
   1884 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   1885 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
   1886 
   1887 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
   1888 	aese	v0.16b, v28.16b
   1889 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   1890 	rev32	v23.16b, v30.16b					//CTR block 8k+18
   1891 
   1892 	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
   1893 	aese	v4.16b, v28.16b
   1894 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   1895 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   1896 
   1897 	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
   1898 	aese	v3.16b, v28.16b
   1899 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   1900 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
   1901 
   1902 	aese	v7.16b, v28.16b
   1903 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   1904 	aese	v2.16b, v28.16b
   1905 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   1906 	aese	v6.16b, v28.16b
   1907 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   1908 	//round 9 is aese only (no aesmc); the last round key rk10 (v27) is applied inside the eor3 that also XORs in the ciphertext
   1909 	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
   1910 	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
   1911 	ldr	q27, [x8, #160]					//load rk10
   1912 
   1913 	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
   1914 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   1915 	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
   1916 
   1917 	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
   1918 	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
   1919 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   1920 
   1921 	rev32	v25.16b, v30.16b					//CTR block 8k+19
   1922 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
   1923 
   1924 	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
   1925 	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
   1926 .inst	0xce016d21	//eor3 v1.16b, v9.16b, v1.16b, v27.16b				//AES block 8k+9 - result
   1927 
   1928 .inst	0xce006d00	//eor3 v0.16b, v8.16b, v0.16b, v27.16b				//AES block 8k+8 - result
   1929 .inst	0xce076de7	//eor3 v7.16b, v15.16b, v7.16b, v27.16b				//AES block 8k+15 - result
   1930 .inst	0xce066dc6	//eor3 v6.16b, v14.16b, v6.16b, v27.16b				//AES block 8k+14 - result
   1931 	//store the 8 results to [x2] and move the staged counter blocks (v20, v22, v23, v25) into v0-v3 for the next pass
   1932 .inst	0xce026d42	//eor3 v2.16b, v10.16b, v2.16b, v27.16b				//AES block 8k+10 - result
   1933 	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
   1934 	mov	v1.16b, v22.16b					//CTR block 8k+17
   1935 
   1936 .inst	0xce046d84	//eor3 v4.16b, v12.16b, v4.16b, v27.16b				//AES block 8k+12 - result
   1937 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   1938 	mov	v0.16b, v20.16b					//CTR block 8k+16
   1939 
   1940 .inst	0xce036d63	//eor3 v3.16b, v11.16b, v3.16b, v27.16b				//AES block 8k+11 - result
   1941 	cmp	x0, x5				//.LOOP CONTROL - flags consumed by the b.lt at the bottom (nothing in between sets flags)
   1942 	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
   1943 
   1944 .inst	0xce056da5	//eor3 v5.16b, v13.16b, v5.16b, v27.16b				//AES block 8k+13 - result
   1945 	mov	v2.16b, v23.16b					//CTR block 8k+18
   1946 
   1947 	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
   1948 	rev32	v4.16b, v30.16b				//CTR block 8k+20
   1949 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
   1950 
   1951 	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
   1952 	mov	v3.16b, v25.16b					//CTR block 8k+19
   1953 	b.lt	.L128_dec_main_loop		//repeat while x0 < x5; NOTE(review): b.lt is a signed compare applied to addresses - confirm this is intended
   1954 
   1955 .L128_dec_prepretail:	//PREPRETAIL: GHASH the 8 ciphertext blocks held in v8-v15 and run AES rounds 0-9 on v0-v7
        	//Unlike the main loop, this section loads no ciphertext, stores no output and does not
        	//branch: it falls straight through into .L128_dec_tail.
        	//rk10 is loaded into v27 but not applied here.
        	//Register roles match the main loop: x3 = GHASH key table, x8 = AES round keys,
        	//x10 = address of the modulo constant, v17/v18/v19 = GHASH high/mid/low, v30/v31 = counter/increment.
   1956 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   1957 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   1958 	rev64	v8.16b, v8.16b						//GHASH block 8k
   1959 
   1960 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   1961 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   1962 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   1963 
   1964 	ldr	q23, [x3, #176]				//load h7l | h7h
   1965 	ext	v23.16b, v23.16b, v23.16b, #8
   1966 	ldr	q25, [x3, #208]				//load h8l | h8h
   1967 	ext	v25.16b, v25.16b, v25.16b, #8
   1968 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
   1969 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   1970 
   1971 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   1972 	ldr	q20, [x3, #128]				//load h5l | h5h
   1973 	ext	v20.16b, v20.16b, v20.16b, #8
   1974 	ldr	q22, [x3, #160]				//load h6l | h6h
   1975 	ext	v22.16b, v22.16b, v22.16b, #8
   1976 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   1977 
   1978 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   1979 
   1980 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   1981 
   1982 	ldr	q21, [x3, #144]				//load h6k | h5k
   1983 	ldr	q24, [x3, #192]				//load h8k | h7k
   1984 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   1985 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   1986 	//GHASH of blocks 8k..8k+3 interleaved with AES rounds 0-2
   1987 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   1988 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   1989 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   1990 
   1991 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   1992 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   1993 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   1994 
   1995 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   1996 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   1997 	aese	v0.16b, v26.16b
   1998 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   1999 
   2000 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   2001 	aese	v4.16b, v26.16b
   2002 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   2003 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   2004 
   2005 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   2006 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   2007 	aese	v3.16b, v26.16b
   2008 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   2009 
   2010 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   2011 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   2012 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   2013 
   2014 	aese	v2.16b, v26.16b
   2015 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   2016 	aese	v1.16b, v26.16b
   2017 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   2018 	aese	v5.16b, v26.16b
   2019 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   2020 
   2021 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
   2022 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   2023 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   2024 
   2025 	aese	v2.16b, v27.16b
   2026 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   2027 	aese	v7.16b, v26.16b
   2028 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   2029 	aese	v6.16b, v26.16b
   2030 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   2031 
   2032 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   2033 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   2034 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   2035 
   2036 	aese	v6.16b, v27.16b
   2037 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   2038 	aese	v4.16b, v27.16b
   2039 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   2040 	aese	v5.16b, v27.16b
   2041 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   2042 
   2043 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   2044 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   2045 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   2046 
   2047 	ldr	q23, [x3, #80]				//load h3l | h3h
   2048 	ext	v23.16b, v23.16b, v23.16b, #8
   2049 	ldr	q25, [x3, #112]				//load h4l | h4h
   2050 	ext	v25.16b, v25.16b, v25.16b, #8
   2051 	aese	v1.16b, v27.16b
   2052 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   2053 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   2054 
   2055 	aese	v3.16b, v27.16b
   2056 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   2057 	aese	v7.16b, v27.16b
   2058 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   2059 	aese	v0.16b, v27.16b
   2060 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   2061 
   2062 	ldr	q20, [x3, #32]				//load h1l | h1h
   2063 	ext	v20.16b, v20.16b, v20.16b, #8
   2064 	ldr	q22, [x3, #64]				//load h2l | h2h
   2065 	ext	v22.16b, v22.16b, v22.16b, #8
   2066 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   2067 
   2068 	aese	v0.16b, v28.16b
   2069 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   2070 	aese	v6.16b, v28.16b
   2071 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   2072 	aese	v2.16b, v28.16b
   2073 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   2074 
   2075 	aese	v4.16b, v28.16b
   2076 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   2077 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   2078 	aese	v7.16b, v28.16b
   2079 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   2080 
   2081 	aese	v1.16b, v28.16b
   2082 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   2083 	aese	v5.16b, v28.16b
   2084 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   2085 	aese	v3.16b, v28.16b
   2086 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   2087 	//GHASH of blocks 8k+4..8k+7 interleaved with AES rounds 3-6
   2088 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   2089 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   2090 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   2091 
   2092 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   2093 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   2094 	aese	v6.16b, v26.16b
   2095 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   2096 
   2097 	ldr	q21, [x3, #48]				//load h2k | h1k
   2098 	ldr	q24, [x3, #96]				//load h4k | h3k
   2099 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   2100 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   2101 
   2102 	aese	v2.16b, v26.16b
   2103 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   2104 	aese	v0.16b, v26.16b
   2105 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   2106 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   2107 
   2108 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   2109 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   2110 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   2111 
   2112 	aese	v4.16b, v26.16b
   2113 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   2114 	aese	v3.16b, v26.16b
   2115 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   2116 	aese	v7.16b, v26.16b
   2117 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   2118 
   2119 	aese	v1.16b, v26.16b
   2120 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   2121 	aese	v5.16b, v26.16b
   2122 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   2123 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   2124 
   2125 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   2126 	aese	v0.16b, v27.16b
   2127 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   2128 	aese	v2.16b, v27.16b
   2129 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   2130 
   2131 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   2132 	aese	v5.16b, v27.16b
   2133 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   2134 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   2135 
   2136 	aese	v1.16b, v27.16b
   2137 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   2138 	aese	v6.16b, v27.16b
   2139 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   2140 	aese	v4.16b, v27.16b
   2141 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   2142 
   2143 	aese	v7.16b, v27.16b
   2144 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   2145 	aese	v3.16b, v27.16b
   2146 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   2147 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   2148 
   2149 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   2150 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   2151 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   2152 
   2153 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   2154 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   2155 	aese	v6.16b, v28.16b
   2156 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   2157 
   2158 	ldr	d16, [x10]			//MODULO - load modulo constant
   2159 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   2160 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   2161 
   2162 	aese	v0.16b, v28.16b
   2163 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   2164 	aese	v2.16b, v28.16b
   2165 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   2166 	aese	v4.16b, v28.16b
   2167 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   2168 
   2169 	aese	v3.16b, v28.16b
   2170 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   2171 	aese	v1.16b, v28.16b
   2172 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   2173 	aese	v5.16b, v28.16b
   2174 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   2175 
   2176 	aese	v7.16b, v28.16b
   2177 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   2178 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   2179 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   2180 
   2181 	aese	v4.16b, v26.16b
   2182 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   2183 	aese	v1.16b, v26.16b
   2184 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   2185 	aese	v2.16b, v26.16b
   2186 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   2187 
   2188 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   2189 	aese	v5.16b, v26.16b
   2190 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   2191 	aese	v0.16b, v26.16b
   2192 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   2193 
   2194 	aese	v3.16b, v26.16b
   2195 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   2196 	aese	v6.16b, v26.16b
   2197 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   2198 	aese	v7.16b, v26.16b
   2199 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   2200 	//MODULO: reduce v17/v18/v19 with the constant in v16, overlapped with AES rounds 7-9
   2201 	aese	v4.16b, v27.16b
   2202 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   2203 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   2204 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   2205 
   2206 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   2207 	aese	v3.16b, v27.16b
   2208 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   2209 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   2210 
   2211 	aese	v5.16b, v27.16b
   2212 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   2213 	aese	v6.16b, v27.16b
   2214 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   2215 	aese	v0.16b, v27.16b
   2216 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   2217 
   2218 	aese	v7.16b, v27.16b
   2219 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   2220 	aese	v1.16b, v27.16b
   2221 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   2222 	aese	v2.16b, v27.16b
   2223 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   2224 
   2225 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   2226 	ldr	q27, [x8, #160]					//load rk10 (v27 is not modified again in this section; the tail copies it to v29)
   2227 
   2228 	aese	v3.16b, v28.16b
   2229 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   2230 	aese	v0.16b, v28.16b
   2231 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   2232 
   2233 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   2234 	aese	v6.16b, v28.16b
   2235 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   2236 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   2237 
   2238 	aese	v2.16b, v28.16b
   2239 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   2240 	aese	v1.16b, v28.16b
   2241 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   2242 	aese	v7.16b, v28.16b
   2243 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   2244 
   2245 	aese	v6.16b, v26.16b						//AES block 8k+14 - round 9
   2246 	aese	v5.16b, v28.16b
   2247 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   2248 	aese	v4.16b, v28.16b
   2249 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   2250 	//round 9 is aese only (no aesmc); v0-v7 are left one round-key XOR short of the final keystream
   2251 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   2252 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   2253 	aese	v2.16b, v26.16b						//AES block 8k+10 - round 9
   2254 
   2255 	aese	v3.16b, v26.16b						//AES block 8k+11 - round 9
   2256 	aese	v5.16b, v26.16b						//AES block 8k+13 - round 9
   2257 	aese	v0.16b, v26.16b						//AES block 8k+8 - round 9
   2258 
   2259 	aese	v4.16b, v26.16b						//AES block 8k+12 - round 9
   2260 	aese	v1.16b, v26.16b						//AES block 8k+9 - round 9
   2261 	aese	v7.16b, v26.16b						//AES block 8k+15 - round 9
   2262 
   2263 .L128_dec_tail:	//TAIL
   2264 
   2265 	mov	v29.16b, v27.16b
   2266 	sub	x5, x4, x0 	//main_end_input_ptr is number of bytes left to process
   2267 
   2268 	cmp	x5, #112
   2269 
   2270 	ldp	q24, q25, [x3, #192]			//load h8k | h7k
   2271 	ext	v25.16b, v25.16b, v25.16b, #8
   2272 	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
   2273 
   2274 	ldp	q20, q21, [x3, #128]			//load h5l | h5h
   2275 	ext	v20.16b, v20.16b, v20.16b, #8
   2276 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
   2277 
   2278 	ldp	q22, q23, [x3, #160]			//load h6l | h6h
   2279 	ext	v22.16b, v22.16b, v22.16b, #8
   2280 	ext	v23.16b, v23.16b, v23.16b, #8
   2281 
   2282 .inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
   2283 	b.gt	.L128_dec_blocks_more_than_7
   2284 
   2285 	cmp	x5, #96
   2286 	mov	v7.16b, v6.16b
   2287 	movi	v19.8b, #0
   2288 
   2289 	movi	v17.8b, #0
   2290 	mov	v6.16b, v5.16b
   2291 	mov	v5.16b, v4.16b
   2292 
   2293 	mov	v4.16b, v3.16b
   2294 	mov	v3.16b, v2.16b
   2295 	mov	v2.16b, v1.16b
   2296 
   2297 	movi	v18.8b, #0
   2298 	sub	v30.4s, v30.4s, v31.4s
   2299 	b.gt	.L128_dec_blocks_more_than_6
   2300 
   2301 	cmp	x5, #80
   2302 	sub	v30.4s, v30.4s, v31.4s
   2303 
   2304 	mov	v7.16b, v6.16b
   2305 	mov	v6.16b, v5.16b
   2306 	mov	v5.16b, v4.16b
   2307 
   2308 	mov	v4.16b, v3.16b
   2309 	mov	v3.16b, v1.16b
   2310 	b.gt	.L128_dec_blocks_more_than_5
   2311 
   2312 	cmp	x5, #64
   2313 
   2314 	mov	v7.16b, v6.16b
   2315 	mov	v6.16b, v5.16b
   2316 	mov	v5.16b, v4.16b
   2317 
   2318 	mov	v4.16b, v1.16b
   2319 	sub	v30.4s, v30.4s, v31.4s
   2320 	b.gt	.L128_dec_blocks_more_than_4
   2321 
   2322 	sub	v30.4s, v30.4s, v31.4s
   2323 	mov	v7.16b, v6.16b
   2324 	mov	v6.16b, v5.16b
   2325 
   2326 	mov	v5.16b, v1.16b
   2327 	cmp	x5, #48
   2328 	b.gt	.L128_dec_blocks_more_than_3
   2329 
   2330 	sub	v30.4s, v30.4s, v31.4s
   2331 	mov	v7.16b, v6.16b
   2332 	cmp	x5, #32
   2333 
   2334 	ldr	q24, [x3, #96]				//load h4k | h3k
   2335 	mov	v6.16b, v1.16b
   2336 	b.gt	.L128_dec_blocks_more_than_2
   2337 
   2338 	cmp	x5, #16
   2339 
   2340 	mov	v7.16b, v1.16b
   2341 	sub	v30.4s, v30.4s, v31.4s
   2342 	b.gt	.L128_dec_blocks_more_than_1
   2343 
   2344 	sub	v30.4s, v30.4s, v31.4s
   2345 	ldr	q21, [x3, #48]				//load h2k | h1k
   2346 	b	.L128_dec_blocks_less_than_1
   2347 .L128_dec_blocks_more_than_7:	//blocks	left >  7
   2348 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   2349 
   2350 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2351 
   2352 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
   2353 
   2354 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
   2355 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
   2356 
   2357 	movi	v16.8b, #0						//suppress further partial tag feed in
   2358 	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
   2359 
   2360 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
   2361 
   2362 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
   2363 	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result
   2364 .inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
   2365 
   2366 	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
   2367 .L128_dec_blocks_more_than_6:	//blocks	left >  6
   2368 
   2369 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   2370 
   2371 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2372 
   2373 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
   2374 
   2375 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
   2376 
   2377 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
   2378 	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
   2379 	movi	v16.8b, #0						//suppress further partial tag feed in
   2380 
   2381 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
   2382 	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
   2383 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   2384 
   2385 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
   2386 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
   2387 
   2388 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
   2389 .inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
   2390 .L128_dec_blocks_more_than_5:	//blocks	left >  5
   2391 
   2392 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   2393 
   2394 	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
   2395 	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result
   2396 
   2397 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2398 
   2399 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
   2400 
   2401 .inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
   2402 
   2403 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
   2404 
   2405 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
   2406 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   2407 	movi	v16.8b, #0						//suppress further partial tag feed in
   2408 
   2409 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
   2410 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   2411 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
   2412 
   2413 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
   2414 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
   2415 .L128_dec_blocks_more_than_4:	//blocks	left >  4
   2416 
   2417 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   2418 
   2419 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2420 	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
   2421 
   2422 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
   2423 	movi	v16.8b, #0						//suppress further partial tag feed in
   2424 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   2425 
   2426 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   2427 
   2428 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
   2429 
   2430 	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
   2431 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
   2432 
   2433 .inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
   2434 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
   2435 
   2436 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
   2437 
   2438 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
   2439 .L128_dec_blocks_more_than_3:	//blocks	left >  3
   2440 
   2441 	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
   2442 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   2443 
   2444 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2445 
   2446 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
   2447 
   2448 	ldr	q25, [x3, #112]				//load h4l | h4h
   2449 	ext	v25.16b, v25.16b, v25.16b, #8
   2450 	ldr	q24, [x3, #96]				//load h4k | h3k
   2451 
   2452 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
   2453 
   2454 	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
   2455 
   2456 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
   2457 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   2458 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   2459 
   2460 	movi	v16.8b, #0						//suppress further partial tag feed in
   2461 .inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
   2462 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
   2463 
   2464 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
   2465 
   2466 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
   2467 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
   2468 .L128_dec_blocks_more_than_2:	//blocks	left >  2
   2469 
   2470 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   2471 
   2472 	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result
   2473 
   2474 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2475 	ldr	q23, [x3, #80]				//load h3l | h3h
   2476 	ext	v23.16b, v23.16b, v23.16b, #8
   2477 	movi	v16.8b, #0						//suppress further partial tag feed in
   2478 
   2479 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
   2480 
   2481 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
   2482 
   2483 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   2484 
   2485 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   2486 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
   2487 	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
   2488 
   2489 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
   2490 
   2491 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
   2492 
   2493 .inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
   2494 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
   2495 .L128_dec_blocks_more_than_1:	//blocks	left >  1
   2496 
   2497 	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
   2498 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   2499 
   2500 	ldr	q22, [x3, #64]				//load h2l | h2h
   2501 	ext	v22.16b, v22.16b, v22.16b, #8
   2502 
   2503 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2504 
   2505 	movi	v16.8b, #0						//suppress further partial tag feed in
   2506 
   2507 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
   2508 
   2509 	ldr	q9, [x0], #16				//AES final block - load ciphertext
   2510 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   2511 
   2512 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
   2513 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
   2514 	ldr	q21, [x3, #48]				//load h2k | h1k
   2515 
   2516 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
   2517 .inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
   2518 
   2519 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   2520 
   2521 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
   2522 
   2523 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
   2524 
   2525 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
   2526 .L128_dec_blocks_less_than_1:	//blocks	left <= 1
   2527 
   2528 	and	x1, x1, #127				//bit_length %= 128
   2529 
   2530 	sub	x1, x1, #128				//bit_length -= 128
   2531 
   2532 	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
   2533 
   2534 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   2535 	and	x1, x1, #127				//bit_length %= 128
   2536 
   2537 	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
   2538 	cmp	x1, #64
   2539 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   2540 
   2541 	csel	x13, x7, x6, lt
   2542 	csel	x14, x6, xzr, lt
   2543 
   2544 	mov	v0.d[1], x14
   2545 	mov	v0.d[0], x13					//ctr0b is mask for last block
   2546 
   2547 	ldr	q20, [x3, #32]				//load h1l | h1h
   2548 	ext	v20.16b, v20.16b, v20.16b, #8
   2549 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   2550 
   2551 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   2552 
   2553 	rev64	v8.16b, v9.16b						//GHASH final block
   2554 
   2555 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   2556 
   2557 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   2558 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
   2559 
   2560 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
   2561 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
   2562 
   2563 	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
   2564 
   2565 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
   2566 	st1	{ v12.16b}, [x2]				//store all 16B
   2567 
   2568 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   2569 
   2570 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
   2571 	ldr	d16, [x10]			//MODULO - load modulo constant
   2572 
   2573 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
   2574 
   2575 	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
   2576 
   2577 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   2578 	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   2579 
   2580 	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
   2581 
   2582 .inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid
   2583 
   2584 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   2585 	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   2586 
   2587 .inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
   2588 	ext	v19.16b, v19.16b, v19.16b, #8
   2589 	rev64	v19.16b, v19.16b
   2590 	st1	{ v19.16b }, [x3]
   2591 	rev32	v30.16b, v30.16b
   2592 
   2593 	str	q30, [x16]					//store the updated counter
   2594 
   2595 	mov	x0, x9
   2596 
   2597 	ldp	d10, d11, [sp, #16]
   2598 	ldp	d12, d13, [sp, #32]
   2599 	ldp	d14, d15, [sp, #48]
   2600 	ldp	d8, d9, [sp], #80
   2601 	ret
   2602 .L128_dec_ret:
   2603 	mov	w0, #0x0
   2604 	ret
   2605 .size	unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
   2606 .globl	unroll8_eor3_aes_gcm_enc_192_kernel
   2607 .type	unroll8_eor3_aes_gcm_enc_192_kernel,%function
   2608 .align	4
   2609 unroll8_eor3_aes_gcm_enc_192_kernel:
   2610 	AARCH64_VALID_CALL_TARGET
   2611 	cbz	x1, .L192_enc_ret
   2612 	stp	d8, d9, [sp, #-80]!
   2613 	lsr	x9, x1, #3
   2614 	mov	x16, x4
   2615 	mov	x8, x5
   2616 	stp	d10, d11, [sp, #16]
   2617 	stp	d12, d13, [sp, #32]
   2618 	stp	d14, d15, [sp, #48]
   2619 	mov	x5, #0xc200000000000000
   2620 	stp	x5, xzr, [sp, #64]
   2621 	add	x10, sp, #64
   2622 
   2623 	mov	x5, x9
   2624 	ld1	{ v0.16b}, [x16]					//CTR block 0
   2625 
   2626 	mov	x15, #0x100000000				//set up counter increment
   2627 	movi	v31.16b, #0x0
   2628 	mov	v31.d[1], x15
   2629 
   2630 	rev32	v30.16b, v0.16b				//set up reversed counter
   2631 
   2632 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
   2633 
   2634 	rev32	v1.16b, v30.16b				//CTR block 1
   2635 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
   2636 
   2637 	rev32	v2.16b, v30.16b				//CTR block 2
   2638 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
   2639 
   2640 	rev32	v3.16b, v30.16b				//CTR block 3
   2641 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
   2642 
   2643 	rev32	v4.16b, v30.16b				//CTR block 4
   2644 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
   2645 	sub	x5, x5, #1		//byte_len - 1
   2646 
   2647 	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   2648 
   2649 	rev32	v5.16b, v30.16b				//CTR block 5
   2650 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
   2651 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   2652 
   2653 	add	x5, x5, x0
   2654 
   2655 	rev32	v6.16b, v30.16b				//CTR block 6
   2656 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
   2657 
   2658 	rev32	v7.16b, v30.16b				//CTR block 7
   2659 
   2660 	aese	v5.16b, v26.16b
   2661 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
   2662 	aese	v4.16b, v26.16b
   2663 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
   2664 	aese	v3.16b, v26.16b
   2665 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
   2666 
   2667 	aese	v0.16b, v26.16b
   2668 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
   2669 	aese	v1.16b, v26.16b
   2670 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
   2671 	aese	v7.16b, v26.16b
   2672 	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
   2673 
   2674 	aese	v6.16b, v26.16b
   2675 	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
   2676 	aese	v2.16b, v26.16b
   2677 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
   2678 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   2679 
   2680 	aese	v5.16b, v27.16b
   2681 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
   2682 	aese	v7.16b, v27.16b
   2683 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
   2684 
   2685 	aese	v2.16b, v27.16b
   2686 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
   2687 	aese	v3.16b, v27.16b
   2688 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
   2689 	aese	v6.16b, v27.16b
   2690 	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
   2691 
   2692 	aese	v5.16b, v28.16b
   2693 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
   2694 	aese	v4.16b, v27.16b
   2695 	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
   2696 	aese	v0.16b, v27.16b
   2697 	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
   2698 
   2699 	aese	v1.16b, v27.16b
   2700 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
   2701 	aese	v7.16b, v28.16b
   2702 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
   2703 	aese	v3.16b, v28.16b
   2704 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
   2705 
   2706 	aese	v2.16b, v28.16b
   2707 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
   2708 	aese	v0.16b, v28.16b
   2709 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
   2710 
   2711 	aese	v1.16b, v28.16b
   2712 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
   2713 	aese	v4.16b, v28.16b
   2714 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
   2715 	aese	v6.16b, v28.16b
   2716 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
   2717 
   2718 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   2719 	aese	v4.16b, v26.16b
   2720 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
   2721 
   2722 	aese	v7.16b, v26.16b
   2723 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
   2724 	aese	v3.16b, v26.16b
   2725 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
   2726 	aese	v2.16b, v26.16b
   2727 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
   2728 
   2729 	aese	v1.16b, v26.16b
   2730 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
   2731 
   2732 	aese	v0.16b, v26.16b
   2733 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
   2734 
   2735 	aese	v6.16b, v26.16b
   2736 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
   2737 
   2738 	aese	v0.16b, v27.16b
   2739 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
   2740 	aese	v1.16b, v27.16b
   2741 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
   2742 	aese	v5.16b, v26.16b
   2743 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
   2744 
   2745 	aese	v3.16b, v27.16b
   2746 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
   2747 	aese	v2.16b, v27.16b
   2748 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
   2749 	aese	v4.16b, v27.16b
   2750 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
   2751 
   2752 	aese	v6.16b, v27.16b
   2753 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
   2754 	aese	v7.16b, v27.16b
   2755 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
   2756 	aese	v5.16b, v27.16b
   2757 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
   2758 
   2759 	aese	v1.16b, v28.16b
   2760 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
   2761 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   2762 	aese	v2.16b, v28.16b
   2763 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
   2764 
   2765 	aese	v4.16b, v28.16b
   2766 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
   2767 	aese	v7.16b, v28.16b
   2768 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
   2769 	aese	v0.16b, v28.16b
   2770 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
   2771 
   2772 	aese	v5.16b, v28.16b
   2773 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
   2774 	aese	v6.16b, v28.16b
   2775 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
   2776 	aese	v3.16b, v28.16b
   2777 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
   2778 
   2779 	add	v30.4s, v30.4s, v31.4s		//CTR block 7
   2780 
   2781 	aese	v5.16b, v26.16b
   2782 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
   2783 	aese	v4.16b, v26.16b
   2784 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
   2785 	aese	v3.16b, v26.16b
   2786 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
   2787 
   2788 	aese	v2.16b, v26.16b
   2789 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
   2790 	aese	v6.16b, v26.16b
   2791 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
   2792 	aese	v1.16b, v26.16b
   2793 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
   2794 
   2795 	aese	v0.16b, v26.16b
   2796 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
   2797 	aese	v7.16b, v26.16b
   2798 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
   2799 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   2800 
   2801 	aese	v6.16b, v27.16b
   2802 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
   2803 	aese	v3.16b, v27.16b
   2804 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
   2805 
   2806 	aese	v4.16b, v27.16b
   2807 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
   2808 	aese	v0.16b, v27.16b
   2809 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
   2810 
   2811 	aese	v7.16b, v27.16b
   2812 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
   2813 	aese	v1.16b, v27.16b
   2814 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
   2815 
   2816 	aese	v2.16b, v27.16b
   2817 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
   2818 	aese	v5.16b, v27.16b
   2819 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
   2820 
   2821 	aese	v7.16b, v28.16b
   2822 	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
   2823 	aese	v0.16b, v28.16b
   2824 	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
   2825 
   2826 	aese	v4.16b, v28.16b
   2827 	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
   2828 	aese	v3.16b, v28.16b
   2829 	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
   2830 	aese	v5.16b, v28.16b
   2831 	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
   2832 
   2833 	aese	v2.16b, v28.16b
   2834 	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
   2835 	aese	v1.16b, v28.16b
   2836 	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
   2837 	aese	v6.16b, v28.16b
   2838 	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
   2839 
   2840 	add	x4, x0, x1, lsr #3		//end_input_ptr
   2841 	cmp	x0, x5				//check if we have <= 8 blocks
   2842 	aese	v3.16b, v26.16b
   2843 	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
   2844 
   2845 	ld1	{ v19.16b}, [x3]
   2846 	ext	v19.16b, v19.16b, v19.16b, #8
   2847 	rev64	v19.16b, v19.16b
   2848 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   2849 
   2850 	aese	v6.16b, v26.16b
   2851 	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
   2852 	aese	v1.16b, v26.16b
   2853 	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
   2854 
   2855 	aese	v5.16b, v26.16b
   2856 	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
   2857 	aese	v2.16b, v26.16b
   2858 	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
   2859 
   2860 	aese	v0.16b, v26.16b
   2861 	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
   2862 	aese	v4.16b, v26.16b
   2863 	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
   2864 
   2865 	aese	v6.16b, v27.16b
   2866 	aesmc	v6.16b, v6.16b			//AES block 14 - round 10
   2867 	aese	v7.16b, v26.16b
   2868 	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
   2869 	aese	v3.16b, v27.16b
   2870 	aesmc	v3.16b, v3.16b			//AES block 11 - round 10
   2871 
   2872 	aese	v1.16b, v27.16b
   2873 	aesmc	v1.16b, v1.16b			//AES block 9 - round 10
   2874 	aese	v5.16b, v27.16b
   2875 	aesmc	v5.16b, v5.16b			//AES block 13 - round 10
   2876 	aese	v4.16b, v27.16b
   2877 	aesmc	v4.16b, v4.16b			//AES block 12 - round 10
   2878 
   2879 	aese	v0.16b, v27.16b
   2880 	aesmc	v0.16b, v0.16b			//AES block 8 - round 10
   2881 	aese	v2.16b, v27.16b
   2882 	aesmc	v2.16b, v2.16b			//AES block 10 - round 10
   2883 	aese	v7.16b, v27.16b
   2884 	aesmc	v7.16b, v7.16b			//AES block 15 - round 10
   2885 
   2886 	aese	v6.16b, v28.16b						//AES block 14 - round 11
   2887 	aese	v3.16b, v28.16b						//AES block 11 - round 11
   2888 
   2889 	aese	v4.16b, v28.16b						//AES block 12 - round 11
   2890 	aese	v7.16b, v28.16b						//AES block 15 - round 11
   2891 	ldr	q26, [x8, #192]					//load rk12
   2892 
   2893 	aese	v1.16b, v28.16b						//AES block 9 - round 11
   2894 	aese	v5.16b, v28.16b						//AES block 13 - round 11
   2895 
   2896 	aese	v2.16b, v28.16b						//AES block 10 - round 11
   2897 	aese	v0.16b, v28.16b						//AES block 8 - round 11
   2898 	b.ge	.L192_enc_tail						//handle tail
   2899 
   2900 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
   2901 
   2902 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
   2903 
   2904 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
   2905 
   2906 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
   2907 
   2908 .inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
   2909 	rev32	v0.16b, v30.16b				//CTR block 8
   2910 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
   2911 
   2912 .inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result
   2913 .inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result
   2914 
   2915 	rev32	v1.16b, v30.16b				//CTR block 9
   2916 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
   2917 .inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
   2918 
   2919 .inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
   2920 .inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
   2921 	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
   2922 
   2923 .inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
   2924 	rev32	v2.16b, v30.16b				//CTR block 10
   2925 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
   2926 
   2927 	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
   2928 	cmp	x0, x5				//check if we have <= 8 blocks
   2929 
   2930 	rev32	v3.16b, v30.16b				//CTR block 11
   2931 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
   2932 .inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
   2933 
   2934 	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
   2935 
   2936 	rev32	v4.16b, v30.16b				//CTR block 12
   2937 	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
   2938 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
   2939 
   2940 	b.ge	.L192_enc_prepretail					//do prepretail
   2941 
   2942 .L192_enc_main_loop:	//main	loop start
   2943 	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
   2944 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   2945 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   2946 
   2947 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   2948 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   2949 	ldr	q23, [x3, #176]				//load h7l | h7h
   2950 	ext	v23.16b, v23.16b, v23.16b, #8
   2951 	ldr	q25, [x3, #208]				//load h8l | h8h
   2952 	ext	v25.16b, v25.16b, v25.16b, #8
   2953 
   2954 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   2955 	rev64	v8.16b, v8.16b						//GHASH block 8k
   2956 	ldr	q20, [x3, #128]				//load h5l | h5h
   2957 	ext	v20.16b, v20.16b, v20.16b, #8
   2958 	ldr	q22, [x3, #160]				//load h6l | h6h
   2959 	ext	v22.16b, v22.16b, v22.16b, #8
   2960 
   2961 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   2962 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   2963 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   2964 
   2965 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
   2966 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   2967 	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
   2968 
   2969 	aese	v0.16b, v26.16b
   2970 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   2971 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   2972 	aese	v1.16b, v26.16b
   2973 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   2974 
   2975 	aese	v3.16b, v26.16b
   2976 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   2977 	aese	v5.16b, v26.16b
   2978 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   2979 	aese	v2.16b, v26.16b
   2980 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   2981 
   2982 	aese	v7.16b, v26.16b
   2983 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   2984 	aese	v4.16b, v26.16b
   2985 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   2986 	aese	v6.16b, v26.16b
   2987 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   2988 
   2989 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   2990 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   2991 	aese	v0.16b, v27.16b
   2992 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   2993 
   2994 	aese	v4.16b, v27.16b
   2995 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   2996 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   2997 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   2998 
   2999 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   3000 	aese	v3.16b, v27.16b
   3001 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   3002 	ldr	q21, [x3, #144]				//load h6k | h5k
   3003 	ldr	q24, [x3, #192]				//load h8k | h7k
   3004 
   3005 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   3006 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   3007 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   3008 
   3009 	aese	v1.16b, v27.16b
   3010 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   3011 	aese	v2.16b, v27.16b
   3012 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   3013 	aese	v5.16b, v27.16b
   3014 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   3015 
   3016 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   3017 	aese	v6.16b, v27.16b
   3018 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   3019 	aese	v7.16b, v27.16b
   3020 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   3021 
   3022 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   3023 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   3024 	aese	v1.16b, v28.16b
   3025 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   3026 
   3027 	aese	v3.16b, v28.16b
   3028 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   3029 	aese	v4.16b, v28.16b
   3030 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   3031 	aese	v6.16b, v28.16b
   3032 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   3033 
   3034 	aese	v5.16b, v28.16b
   3035 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   3036 	aese	v1.16b, v26.16b
   3037 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   3038 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   3039 
   3040 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   3041 	aese	v7.16b, v28.16b
   3042 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   3043 	aese	v4.16b, v26.16b
   3044 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   3045 
   3046 	aese	v2.16b, v28.16b
   3047 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   3048 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   3049 	aese	v0.16b, v28.16b
   3050 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   3051 
   3052 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   3053 	aese	v3.16b, v26.16b
   3054 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   3055 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   3056 
   3057 	aese	v0.16b, v26.16b
   3058 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   3059 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   3060 	ldr	q23, [x3, #80]				//load h3l | h3h
   3061 	ext	v23.16b, v23.16b, v23.16b, #8
   3062 	ldr	q25, [x3, #112]				//load h4l | h4h
   3063 	ext	v25.16b, v25.16b, v25.16b, #8
   3064 
   3065 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k - mid
   3066 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   3067 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   3068 
   3069 	aese	v5.16b, v26.16b
   3070 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   3071 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   3072 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   3073 
   3074 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   3075 	aese	v6.16b, v26.16b
   3076 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   3077 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   3078 
   3079 	aese	v1.16b, v27.16b
   3080 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   3081 	aese	v3.16b, v27.16b
   3082 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   3083 	aese	v7.16b, v26.16b
   3084 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   3085 
   3086 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   3087 	aese	v6.16b, v27.16b
   3088 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   3089 	aese	v2.16b, v26.16b
   3090 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   3091 
   3092 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   3093 	aese	v0.16b, v27.16b
   3094 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   3095 	aese	v4.16b, v27.16b
   3096 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   3097 
   3098 	aese	v2.16b, v27.16b
   3099 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   3100 	aese	v5.16b, v27.16b
   3101 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   3102 	aese	v7.16b, v27.16b
   3103 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   3104 
   3105 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   3106 	aese	v4.16b, v28.16b
   3107 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   3108 	ldr	q20, [x3, #32]				//load h1l | h1h
   3109 	ext	v20.16b, v20.16b, v20.16b, #8
   3110 	ldr	q22, [x3, #64]				//load h2l | h2h
   3111 	ext	v22.16b, v22.16b, v22.16b, #8
   3112 
   3113 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   3114 	aese	v2.16b, v28.16b
   3115 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   3116 	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
   3117 
   3118 	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
   3119 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   3120 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   3121 
   3122 	aese	v5.16b, v28.16b
   3123 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   3124 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   3125 
   3126 	aese	v6.16b, v28.16b
   3127 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   3128 	ldr	q21, [x3, #48]				//load h2k | h1k
   3129 	ldr	q24, [x3, #96]				//load h4k | h3k
   3130 
   3131 	aese	v1.16b, v28.16b
   3132 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   3133 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   3134 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   3135 
   3136 	aese	v3.16b, v28.16b
   3137 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   3138 	aese	v7.16b, v28.16b
   3139 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   3140 	aese	v0.16b, v28.16b
   3141 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   3142 
   3143 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   3144 	aese	v4.16b, v26.16b
   3145 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   3146 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   3147 
   3148 	aese	v0.16b, v26.16b
   3149 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   3150 	aese	v3.16b, v26.16b
   3151 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   3152 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   3153 
   3154 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   3155 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   3156 	aese	v2.16b, v26.16b
   3157 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   3158 
   3159 	aese	v6.16b, v26.16b
   3160 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   3161 	aese	v5.16b, v26.16b
   3162 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   3163 
   3164 	aese	v7.16b, v26.16b
   3165 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   3166 	aese	v2.16b, v27.16b
   3167 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   3168 	aese	v1.16b, v26.16b
   3169 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   3170 
   3171 	aese	v6.16b, v27.16b
   3172 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   3173 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   3174 
   3175 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   3176 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   3177 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   3178 
   3179 	aese	v4.16b, v27.16b
   3180 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   3181 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   3182 	aese	v5.16b, v27.16b
   3183 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   3184 
   3185 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   3186 	aese	v7.16b, v27.16b
   3187 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   3188 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   3189 
   3190 	ldr	d16, [x10]			//MODULO - load modulo constant
   3191 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   3192 	aese	v0.16b, v27.16b
   3193 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   3194 
   3195 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   3196 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   3197 	aese	v3.16b, v27.16b
   3198 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   3199 
   3200 	aese	v5.16b, v28.16b
   3201 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   3202 	aese	v4.16b, v28.16b
   3203 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   3204 	aese	v0.16b, v28.16b
   3205 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   3206 
   3207 	aese	v6.16b, v28.16b
   3208 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   3209 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   3210 	aese	v1.16b, v27.16b
   3211 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   3212 
   3213 	aese	v7.16b, v28.16b
   3214 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   3215 	aese	v2.16b, v28.16b
   3216 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   3217 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   3218 
   3219 	aese	v1.16b, v28.16b
   3220 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   3221 	aese	v3.16b, v28.16b
   3222 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   3223 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   3224 
   3225 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   3226 	rev32	v20.16b, v30.16b					//CTR block 8k+16
   3227 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
   3228 
   3229 	aese	v2.16b, v26.16b
   3230 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   3231 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   3232 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   3233 
   3234 	aese	v6.16b, v26.16b
   3235 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   3236 	aese	v3.16b, v26.16b
   3237 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   3238 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
   3239 
   3240 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   3241 	rev32	v22.16b, v30.16b					//CTR block 8k+17
   3242 	aese	v0.16b, v26.16b
   3243 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   3244 
   3245 	aese	v4.16b, v26.16b
   3246 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   3247 	aese	v1.16b, v26.16b
   3248 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   3249 	aese	v7.16b, v26.16b
   3250 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   3251 
   3252 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   3253 	aese	v5.16b, v26.16b
   3254 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   3255 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
   3256 
   3257 	aese	v2.16b, v27.16b
   3258 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   3259 	aese	v4.16b, v27.16b
   3260 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   3261 	ldr	q26, [x8, #192]					//load rk12
   3262 	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   3263 
   3264 	aese	v0.16b, v27.16b
   3265 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   3266 	aese	v7.16b, v27.16b
   3267 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   3268 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
   3269 
   3270 	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
   3271 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   3272 	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load plaintext
   3273 
   3274 	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load plaintext
   3275 	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
   3276 	aese	v1.16b, v27.16b
   3277 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   3278 
   3279 	rev32	v23.16b, v30.16b					//CTR block 8k+18
   3280 	aese	v5.16b, v27.16b
   3281 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   3282 
   3283 	aese	v3.16b, v27.16b
   3284 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   3285 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   3286 
   3287 	aese	v6.16b, v27.16b
   3288 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   3289 	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
   3290 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
   3291 
   3292 	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
   3293 	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
   3294 .inst	0xce04698c	//eor3 v12.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
   3295 
   3296 	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
   3297 	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
   3298 	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
   3299 
   3300 	rev32	v25.16b, v30.16b					//CTR block 8k+19
   3301 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
   3302 .inst	0xce0769ef	//eor3 v15.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
   3303 
   3304 .inst	0xce02694a	//eor3 v10.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
   3305 .inst	0xce006908	//eor3 v8.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
   3306 	mov	v2.16b, v23.16b					//CTR block 8k+18
   3307 
   3308 .inst	0xce016929	//eor3 v9.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
   3309 	mov	v1.16b, v22.16b					//CTR block 8k+17
   3310 	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
   3311 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   3312 
   3313 .inst	0xce0669ce	//eor3 v14.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
   3314 	mov	v0.16b, v20.16b					//CTR block 8k+16
   3315 	rev32	v4.16b, v30.16b				//CTR block 8k+20
   3316 
   3317 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
   3318 .inst	0xce0569ad	//eor3 v13.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
   3319 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   3320 
   3321 .inst	0xce03696b	//eor3 v11.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result
   3322 	mov	v3.16b, v25.16b					//CTR block 8k+19
   3323 
   3324 	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
   3325 
   3326 	stp	q12, q13, [x2], #32			//AES block 8k+12, 8k+13 - store result
   3327 
   3328 	cmp	x0, x5				//.LOOP CONTROL
   3329 	stp	q14, q15, [x2], #32			//AES block 8k+14, 8k+15 - store result
   3330 	b.lt	.L192_enc_main_loop
   3331 
   3332 .L192_enc_prepretail:	//PREPRETAIL - GHASH the eight blocks held in v8-v15 while running AES rounds 0-11 on counter blocks v0-v7; no message data is loaded or stored in this section
   3333 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   3334 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   3335 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   3336 
   3337 	ldr	q23, [x3, #176]				//load h7l | h7h
   3338 	ext	v23.16b, v23.16b, v23.16b, #8	//swap the two 64-bit halves
   3339 	ldr	q25, [x3, #208]				//load h8l | h8h
   3340 	ext	v25.16b, v25.16b, v25.16b, #8	//swap the two 64-bit halves
   3341 	rev64	v8.16b, v8.16b						//GHASH block 8k
   3342 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0 - swap halves of the running GHASH accumulator
   3343 
   3344 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   3345 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   3346 	ldr	q21, [x3, #144]				//load h6k | h5k
   3347 	ldr	q24, [x3, #192]				//load h8k | h7k
   3348 
   3349 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   3350 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   3351 	ldr	q20, [x3, #128]				//load h5l | h5h
   3352 	ext	v20.16b, v20.16b, v20.16b, #8	//swap the two 64-bit halves
   3353 	ldr	q22, [x3, #160]				//load h6l | h6h
   3354 	ext	v22.16b, v22.16b, v22.16b, #8	//swap the two 64-bit halves
   3355 
   3356 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1 - fold the running accumulator into block 8k
   3357 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   3358 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   3359 
   3360 	aese	v5.16b, v26.16b
   3361 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   3362 	aese	v2.16b, v26.16b
   3363 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   3364 	aese	v3.16b, v26.16b
   3365 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   3366 
   3367 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   3368 	aese	v0.16b, v26.16b
   3369 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   3370 	aese	v6.16b, v26.16b
   3371 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   3372 
   3373 	aese	v1.16b, v26.16b
   3374 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   3375 	aese	v4.16b, v26.16b
   3376 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   3377 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   3378 
   3379 	aese	v6.16b, v27.16b
   3380 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   3381 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   3382 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   3383 
   3384 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   3385 	aese	v7.16b, v26.16b
   3386 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   3387 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   3388 
   3389 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   3390 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   3391 	aese	v2.16b, v27.16b
   3392 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   3393 
   3394 	aese	v5.16b, v27.16b
   3395 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   3396 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   3397 	aese	v1.16b, v27.16b
   3398 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   3399 
   3400 	aese	v7.16b, v27.16b
   3401 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   3402 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   3403 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   3404 
   3405 	aese	v3.16b, v27.16b
   3406 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   3407 	aese	v0.16b, v27.16b
   3408 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   3409 	aese	v4.16b, v27.16b
   3410 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   3411 
   3412 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   3413 	aese	v5.16b, v28.16b
   3414 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   3415 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   3416 
   3417 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   3418 	aese	v7.16b, v28.16b
   3419 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   3420 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   3421 
   3422 	aese	v5.16b, v26.16b
   3423 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   3424 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   3425 	aese	v6.16b, v28.16b
   3426 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   3427 
   3428 	aese	v0.16b, v28.16b
   3429 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   3430 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   3431 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   3432 
   3433 	aese	v3.16b, v28.16b
   3434 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   3435 	rev64	v13.16b, v13.16b						//GHASH block 8k+5 (t0, t1, t2 and t3 free)
   3436 	rev64	v14.16b, v14.16b						//GHASH block 8k+6 (t0, t1, and t2 free)
   3437 
   3438 	aese	v2.16b, v28.16b
   3439 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   3440 	aese	v1.16b, v28.16b
   3441 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   3442 	aese	v4.16b, v28.16b
   3443 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   3444 
   3445 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   3446 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   3447 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   3448 
   3449 	aese	v1.16b, v26.16b
   3450 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   3451 	aese	v6.16b, v26.16b
   3452 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   3453 	aese	v2.16b, v26.16b
   3454 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   3455 
   3456 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   3457 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   3458 	aese	v7.16b, v26.16b
   3459 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   3460 
   3461 	ldr	q23, [x3, #80]				//load h3l | h3h
   3462 	ext	v23.16b, v23.16b, v23.16b, #8	//swap the two 64-bit halves
   3463 	ldr	q25, [x3, #112]				//load h4l | h4h
   3464 	ext	v25.16b, v25.16b, v25.16b, #8	//swap the two 64-bit halves
   3465 	aese	v3.16b, v26.16b
   3466 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   3467 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   3468 
   3469 	ldr	q20, [x3, #32]				//load h1l | h1h
   3470 	ext	v20.16b, v20.16b, v20.16b, #8	//swap the two 64-bit halves
   3471 	ldr	q22, [x3, #64]				//load h2l | h2h
   3472 	ext	v22.16b, v22.16b, v22.16b, #8	//swap the two 64-bit halves
   3473 	aese	v4.16b, v26.16b
   3474 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   3475 	rev64	v12.16b, v12.16b						//GHASH block 8k+4 (t0, t1, and t2 free)
   3476 
   3477 	aese	v0.16b, v26.16b
   3478 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   3479 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   3480 	aese	v6.16b, v27.16b
   3481 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   3482 
   3483 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   3484 	aese	v7.16b, v27.16b
   3485 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   3486 	aese	v5.16b, v27.16b
   3487 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   3488 
   3489 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   3490 	aese	v3.16b, v27.16b
   3491 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   3492 	aese	v0.16b, v27.16b
   3493 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   3494 
   3495 	aese	v1.16b, v27.16b
   3496 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   3497 	aese	v4.16b, v27.16b
   3498 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   3499 	aese	v2.16b, v27.16b
   3500 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   3501 
   3502 	aese	v0.16b, v28.16b
   3503 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   3504 	rev64	v15.16b, v15.16b						//GHASH block 8k+7 (t0, t1, t2 and t3 free)
   3505 	ldr	q21, [x3, #48]				//load h2k | h1k
   3506 	ldr	q24, [x3, #96]				//load h4k | h3k
   3507 
   3508 	aese	v1.16b, v28.16b
   3509 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   3510 	aese	v2.16b, v28.16b
   3511 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   3512 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   3513 
   3514 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   3515 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   3516 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   3517 
   3518 	aese	v4.16b, v28.16b
   3519 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   3520 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   3521 
   3522 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   3523 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   3524 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   3525 
   3526 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   3527 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   3528 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   3529 
   3530 	aese	v5.16b, v28.16b
   3531 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   3532 	aese	v1.16b, v26.16b
   3533 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   3534 	aese	v7.16b, v28.16b
   3535 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   3536 
   3537 	aese	v6.16b, v28.16b
   3538 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   3539 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   3540 	aese	v3.16b, v28.16b
   3541 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   3542 
   3543 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   3544 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   3545 
   3546 	aese	v4.16b, v26.16b
   3547 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   3548 	aese	v5.16b, v26.16b
   3549 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   3550 	aese	v1.16b, v27.16b
   3551 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   3552 
   3553 	aese	v0.16b, v26.16b
   3554 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   3555 	aese	v7.16b, v26.16b
   3556 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   3557 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   3558 
   3559 	aese	v2.16b, v26.16b
   3560 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   3561 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   3562 	aese	v5.16b, v27.16b
   3563 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   3564 
   3565 	aese	v6.16b, v26.16b
   3566 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   3567 	ldr	d16, [x10]			//MODULO - load modulo constant (64 bits read through x10)
   3568 	aese	v3.16b, v26.16b
   3569 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   3570 
   3571 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   3572 	aese	v0.16b, v27.16b
   3573 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   3574 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   3575 
   3576 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   3577 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   3578 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   3579 
   3580 	aese	v4.16b, v27.16b
   3581 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   3582 	aese	v2.16b, v27.16b
   3583 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   3584 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   3585 
   3586 	aese	v3.16b, v27.16b
   3587 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   3588 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   3589 
   3590 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   3591 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   3592 
   3593 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up (mid ^= high ^ low)
   3594 	ext	v29.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   3595 	aese	v7.16b, v27.16b
   3596 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   3597 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   3598 
   3599 	aese	v5.16b, v28.16b
   3600 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   3601 	aese	v1.16b, v28.16b
   3602 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   3603 
   3604 	aese	v6.16b, v27.16b
   3605 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   3606 	aese	v2.16b, v28.16b
   3607 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   3608 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   3609 
   3610 	aese	v3.16b, v28.16b
   3611 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   3612 	aese	v5.16b, v26.16b
   3613 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   3614 	aese	v4.16b, v28.16b
   3615 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   3616 
   3617 	aese	v0.16b, v28.16b
   3618 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   3619 	aese	v7.16b, v28.16b
   3620 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   3621 	aese	v6.16b, v28.16b
   3622 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   3623 
   3624 	aese	v3.16b, v26.16b
   3625 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   3626 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   3627 	aese	v4.16b, v26.16b
   3628 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   3629 
   3630 	aese	v2.16b, v26.16b
   3631 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   3632 	aese	v7.16b, v26.16b
   3633 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   3634 
   3635 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   3636 	aese	v6.16b, v26.16b
   3637 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   3638 	aese	v0.16b, v26.16b
   3639 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   3640 	aese	v1.16b, v26.16b
   3641 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   3642 
   3643 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   3644 	ldr	q26, [x8, #192]					//load rk12 - not used by any aese here; the tail XORs it in via eor3
   3645 
   3646 	aese	v7.16b, v27.16b
   3647 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   3648 	aese	v1.16b, v27.16b
   3649 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   3650 	aese	v2.16b, v27.16b
   3651 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   3652 
   3653 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low; v19 now holds the reduced GHASH state
   3654 	aese	v0.16b, v27.16b
   3655 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   3656 	aese	v3.16b, v27.16b
   3657 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   3658 
   3659 	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11 (last round: aese only, no aesmc)
   3660 	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
   3661 
   3662 	aese	v4.16b, v27.16b
   3663 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   3664 	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
   3665 
   3666 	aese	v5.16b, v27.16b
   3667 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   3668 	aese	v6.16b, v27.16b
   3669 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   3670 
   3671 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   3672 	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
   3673 	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
   3674 
   3675 	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
   3676 	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
   3677 	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
   3678 
   3679 .L192_enc_tail:	//TAIL - encrypt the first remaining block with v0, then dispatch on the number of bytes left (x5) to the matching .L192_enc_blocks_* stage
   3680 
   3681 	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
   3682 	ext	v20.16b, v20.16b, v20.16b, #8	//swap the two 64-bit halves of h5
   3683 	sub	x5, x4, x0 	//x5 (main_end_input_ptr) = x4 - x0 = number of bytes left to process; x4 is presumably the end-of-input pointer set up outside this view
   3684 
   3685 	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
   3686 
   3687 	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
   3688 	ext	v25.16b, v25.16b, v25.16b, #8	//swap the two 64-bit halves of h8
   3689 
   3690 	mov	v29.16b, v26.16b	//keep rk12 in v29; v26 is reused as a GHASH temporary by the tail stages
   3691 
   3692 	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
   3693 	ext	v22.16b, v22.16b, v22.16b, #8	//swap the two 64-bit halves of h6
   3694 	ext	v23.16b, v23.16b, v23.16b, #8	//swap the two 64-bit halves of h7
   3695 	cmp	x5, #112	//more than 7 blocks (112 bytes) left?
   3696 
   3697 .inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b			//AES block 8k+8 - result
   3698 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
   3699 	b.gt	.L192_enc_blocks_more_than_7
   3700 
   3701 	cmp	x5, #96	//more than 6 blocks (96 bytes) left?
   3702 	mov	v7.16b, v6.16b	//7 or fewer blocks: move counter blocks v1..v6 into v2..v7 (v1 itself is left unchanged)
   3703 	movi	v17.8b, #0	//final-7 stage is skipped, so start the GHASH high accumulator at zero
   3704 
   3705 	mov	v6.16b, v5.16b
   3706 	movi	v19.8b, #0	//start the GHASH low accumulator at zero
   3707 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one increment (v31)
   3708 
   3709 	mov	v5.16b, v4.16b
   3710 	mov	v4.16b, v3.16b
   3711 	mov	v3.16b, v2.16b
   3712 
   3713 	mov	v2.16b, v1.16b	//v2 is the block .L192_enc_blocks_more_than_6 XORs into the next plaintext
   3714 	movi	v18.8b, #0	//start the GHASH mid accumulator at zero
   3715 	b.gt	.L192_enc_blocks_more_than_6
   3716 
   3717 	mov	v7.16b, v6.16b	//6 or fewer blocks: shift up once more
   3718 	cmp	x5, #80	//more than 5 blocks (80 bytes) left?
   3719 
   3720 	mov	v6.16b, v5.16b
   3721 	mov	v5.16b, v4.16b
   3722 	mov	v4.16b, v3.16b
   3723 
   3724 	mov	v3.16b, v1.16b	//v1 has not been overwritten, so v3 receives the same block v2 got above
   3725 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one more increment
   3726 	b.gt	.L192_enc_blocks_more_than_5
   3727 
   3728 	cmp	x5, #64	//more than 4 blocks (64 bytes) left?
   3729 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one more increment
   3730 
   3731 	mov	v7.16b, v6.16b
   3732 	mov	v6.16b, v5.16b
   3733 	mov	v5.16b, v4.16b
   3734 
   3735 	mov	v4.16b, v1.16b
   3736 	b.gt	.L192_enc_blocks_more_than_4
   3737 
   3738 	mov	v7.16b, v6.16b
   3739 	mov	v6.16b, v5.16b
   3740 	mov	v5.16b, v1.16b
   3741 
   3742 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one more increment
   3743 	cmp	x5, #48	//more than 3 blocks (48 bytes) left?
   3744 	b.gt	.L192_enc_blocks_more_than_3
   3745 
   3746 	mov	v7.16b, v6.16b
   3747 	mov	v6.16b, v1.16b
   3748 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one more increment
   3749 
   3750 	ldr	q24, [x3, #96]				//load h4k | h3k
   3751 	cmp	x5, #32	//more than 2 blocks (32 bytes) left?
   3752 	b.gt	.L192_enc_blocks_more_than_2
   3753 
   3754 	sub	v30.4s, v30.4s, v31.4s	//step the counter back by one more increment
   3755 
   3756 	cmp	x5, #16	//more than 1 block (16 bytes) left?
   3757 	mov	v7.16b, v1.16b
   3758 	b.gt	.L192_enc_blocks_more_than_1
   3759 
   3760 	sub	v30.4s, v30.4s, v31.4s	//seventh and last step back on this path (16 bytes or fewer left)
   3761 	ldr	q21, [x3, #48]				//load h2k | h1k
   3762 	b	.L192_enc_blocks_less_than_1
   3763 .L192_enc_blocks_more_than_7:	//blocks	left >  7 - store the ciphertext in v9, start the GHASH accumulators (v17 high, v18 mid, v19 low) from it, and encrypt the next block
   3764 	st1	{ v9.16b}, [x2], #16			 	//AES final-7 block  - store result
   3765 
   3766 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   3767 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid (upper 64 bits of the h8k | h7k pair in v24)
   3768 
   3769 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3770 
   3771 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
   3772 
   3773 	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
   3774 
   3775 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid (v27.d[0] = upper ^ lower 64 bits of the block)
   3776 	movi	v16.8b, #0						//suppress further partial tag feed in
   3777 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low (written, not accumulated)
   3778 
   3779 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high (written, not accumulated)
   3780 
   3781 	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid (written, not accumulated)
   3782 .inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result (plaintext ^ counter block v1 ^ rk12 copy in v29)
   3783 .L192_enc_blocks_more_than_6:	//blocks	left >  6 - store the ciphertext in v9, accumulate its GHASH products into v17/v18/v19, and encrypt the next block
   3784 
   3785 	st1	{ v9.16b}, [x2], #16			 	//AES final-6 block - store result
   3786 
   3787 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   3788 
   3789 	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
   3790 
   3791 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3792 
   3793 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
   3794 
   3795 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low (v26 is a temporary here; rk12 is kept in v29)
   3796 .inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result (plaintext ^ counter block v2 ^ rk12 copy in v29)
   3797 
   3798 	movi	v16.8b, #0						//suppress further partial tag feed in
   3799 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   3800 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid (v27.d[0] = upper ^ lower 64 bits of the block)
   3801 
   3802 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid (lower 64 bits of the h8k | h7k pair in v24)
   3803 
   3804 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
   3805 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
   3806 
   3807 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
   3808 .L192_enc_blocks_more_than_5:	//blocks	left >  5
   3809 
   3810 	st1	{ v9.16b}, [x2], #16			 	//AES final-5 block - store result
   3811 
   3812 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   3813 
   3814 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3815 
   3816 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
   3817 
   3818 	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
   3819 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   3820 
   3821 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
   3822 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
   3823 
   3824 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
   3825 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   3826 
   3827 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
   3828 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
   3829 
   3830 .inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
   3831 	movi	v16.8b, #0						//suppress further partial tag feed in
   3832 
   3833 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
   3834 .L192_enc_blocks_more_than_4:	//blocks	left >  4
   3835 
   3836 	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result
   3837 
   3838 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   3839 
   3840 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3841 
   3842 	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
   3843 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   3844 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
   3845 
   3846 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   3847 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
   3848 
   3849 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
   3850 
   3851 	movi	v16.8b, #0						//suppress further partial tag feed in
   3852 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
   3853 
   3854 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
   3855 
   3856 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
   3857 .inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
   3858 .L192_enc_blocks_more_than_3:	//blocks	left >  3
   3859 
   3860 	ldr	q24, [x3, #96]				//load h4k | h3k
   3861 	st1	{ v9.16b}, [x2], #16			 	//AES final-3 block - store result
   3862 
   3863 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   3864 
   3865 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3866 	movi	v16.8b, #0						//suppress further partial tag feed in
   3867 
   3868 	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
   3869 	ldr	q25, [x3, #112]				//load h4l | h4h
   3870 	ext	v25.16b, v25.16b, v25.16b, #8
   3871 
   3872 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
   3873 
   3874 .inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
   3875 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
   3876 
   3877 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
   3878 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   3879 
   3880 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   3881 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
   3882 
   3883 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
   3884 
   3885 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
   3886 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
   3887 .L192_enc_blocks_more_than_2:	//blocks	left >  2
   3888 
   3889 	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result
   3890 
   3891 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   3892 	ldr	q23, [x3, #80]				//load h3l | h3h
   3893 	ext	v23.16b, v23.16b, v23.16b, #8
   3894 
   3895 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3896 
   3897 	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
   3898 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
   3899 
   3900 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
   3901 
   3902 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   3903 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   3904 	movi	v16.8b, #0						//suppress further partial tag feed in
   3905 
   3906 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
   3907 
   3908 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
   3909 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
   3910 
   3911 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
   3912 .inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
   3913 .L192_enc_blocks_more_than_1:	//blocks	left >  1
   3914 
   3915 	ldr	q22, [x3, #64]				//load h2l | h2h
   3916 	ext	v22.16b, v22.16b, v22.16b, #8				//swap the two 64-bit halves of v22
   3917 	st1	{ v9.16b}, [x2], #16			 	//AES final-1 block - store result
   3918 
   3919 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   3920 
   3921 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3922 
   3923 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
   3924 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   3925 
   3926 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
   3927 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   3928 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
   3929 
   3930 	ldr	q9, [x0], #16				//AES final block - load plaintext
   3931 	ldr	q21, [x3, #48]				//load h2k | h1k
   3932 
   3933 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
   3934 
   3935 .inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
   3936 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
   3937 
   3938 	movi	v16.8b, #0						//suppress further partial tag feed in
   3939 
   3940 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
   3941 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
   3942 .L192_enc_blocks_less_than_1:	//blocks	left <= 1
   3943 
   3944 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   3945 	and	x1, x1, #127				//bit_length %= 128
   3946 
   3947 	sub	x1, x1, #128				//bit_length -= 128
   3948 
   3949 	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
   3950 
   3951 	and	x1, x1, #127				//bit_length %= 128
   3952 
   3953 	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
   3954 	cmp	x1, #64				//x1 = number of unused bits in the last block (0..127); compare against half a block
   3955 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   3956 
   3957 	csel	x13, x7, x6, lt				//low 64b of mask: all ones when x1 < 64, otherwise the shifted mask
   3958 	csel	x14, x6, xzr, lt				//high 64b of mask: the shifted mask when x1 < 64, otherwise zero
   3959 
   3960 	mov	v0.d[1], x14				//upper half of the last-block mask
   3961 	ldr	q20, [x3, #32]				//load h1l | h1h
   3962 	ext	v20.16b, v20.16b, v20.16b, #8				//swap the two 64-bit halves of v20
   3963 
   3964 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   3965 	mov	v0.d[0], x13					//ctr0b is mask for last block
   3966 
   3967 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   3968 
   3969 	rev64	v8.16b, v9.16b						//GHASH final block
   3970 	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
   3971 
   3972 	st1	{ v9.16b}, [x2]				//store all 16B
   3973 
   3974 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   3975 
   3976 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
   3977 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   3978 
   3979 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
   3980 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   3981 
   3982 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
   3983 
   3984 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
   3985 
   3986 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
   3987 	ldr	d16, [x10]			//MODULO - load modulo constant
   3988 
   3989 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
   3990 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   3991 
   3992 	rev32	v30.16b, v30.16b				//reverse the bytes within each 32-bit word of the counter register
   3993 
   3994 	str	q30, [x16]					//store the updated counter
   3995 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   3996 
   3997 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   3998 
   3999 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   4000 
   4001 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   4002 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   4003 
   4004 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   4005 	ext	v19.16b, v19.16b, v19.16b, #8				//swap the two 64-bit halves of the folded result
   4006 	rev64	v19.16b, v19.16b				//reverse the bytes within each 64-bit half
   4007 	st1	{ v19.16b }, [x3]				//write the folded result back to [x3] (presumably the running tag - confirm against caller)
   4008 
   4009 	mov	x0, x9					//return sizes
   4010 
   4011 	ldp	d10, d11, [sp, #16]				//restore d10, d11 from the stack frame
   4012 	ldp	d12, d13, [sp, #32]				//restore d12, d13 from the stack frame
   4013 	ldp	d14, d15, [sp, #48]				//restore d14, d15 from the stack frame
   4014 	ldp	d8, d9, [sp], #80				//restore d8, d9 and pop the 80-byte frame
   4015 	ret
   4016 
   4017 .L192_enc_ret:
   4018 	mov	w0, #0x0
   4019 	ret
   4020 .size	unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
   4021 .globl	unroll8_eor3_aes_gcm_dec_192_kernel
   4022 .type	unroll8_eor3_aes_gcm_dec_192_kernel,%function
   4023 .align	4
   4024 unroll8_eor3_aes_gcm_dec_192_kernel:
   4025 	AARCH64_VALID_CALL_TARGET
   4026 	cbz	x1, .L192_dec_ret
   4027 	stp	d8, d9, [sp, #-80]!
   4028 	lsr	x9, x1, #3
   4029 	mov	x16, x4
   4030 	mov	x8, x5
   4031 	stp	d10, d11, [sp, #16]
   4032 	stp	d12, d13, [sp, #32]
   4033 	stp	d14, d15, [sp, #48]
   4034 	mov	x5, #0xc200000000000000
   4035 	stp	x5, xzr, [sp, #64]
   4036 	add	x10, sp, #64
   4037 
   4038 	mov	x5, x9
   4039 	ld1	{ v0.16b}, [x16]					//CTR block 0
   4040 	ld1	{ v19.16b}, [x3]
   4041 
   4042 	mov	x15, #0x100000000			//set up counter increment
   4043 	movi	v31.16b, #0x0
   4044 	mov	v31.d[1], x15
   4045 
   4046 	rev32	v30.16b, v0.16b				//set up reversed counter
   4047 
   4048 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
   4049 
   4050 	rev32	v1.16b, v30.16b				//CTR block 1
   4051 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
   4052 
   4053 	rev32	v2.16b, v30.16b				//CTR block 2
   4054 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
   4055 
   4056 	rev32	v3.16b, v30.16b				//CTR block 3
   4057 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
   4058 
   4059 	rev32	v4.16b, v30.16b				//CTR block 4
   4060 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
   4061 
   4062 	rev32	v5.16b, v30.16b				//CTR block 5
   4063 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
   4064 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   4065 
   4066 	rev32	v6.16b, v30.16b				//CTR block 6
   4067 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
   4068 
   4069 	rev32	v7.16b, v30.16b				//CTR block 7
   4070 
   4071 	aese	v3.16b, v26.16b
   4072 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
   4073 	aese	v6.16b, v26.16b
   4074 	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
   4075 	aese	v5.16b, v26.16b
   4076 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
   4077 
   4078 	aese	v0.16b, v26.16b
   4079 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
   4080 	aese	v1.16b, v26.16b
   4081 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
   4082 	aese	v7.16b, v26.16b
   4083 	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
   4084 
   4085 	aese	v2.16b, v26.16b
   4086 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
   4087 	aese	v4.16b, v26.16b
   4088 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
   4089 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   4090 
   4091 	aese	v1.16b, v27.16b
   4092 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
   4093 
   4094 	aese	v2.16b, v27.16b
   4095 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
   4096 
   4097 	aese	v0.16b, v27.16b
   4098 	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
   4099 	aese	v3.16b, v27.16b
   4100 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
   4101 	aese	v7.16b, v27.16b
   4102 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
   4103 
   4104 	aese	v5.16b, v27.16b
   4105 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
   4106 	aese	v6.16b, v27.16b
   4107 	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
   4108 
   4109 	aese	v7.16b, v28.16b
   4110 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
   4111 	aese	v0.16b, v28.16b
   4112 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
   4113 	aese	v4.16b, v27.16b
   4114 	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
   4115 
   4116 	aese	v5.16b, v28.16b
   4117 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
   4118 	aese	v1.16b, v28.16b
   4119 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
   4120 	aese	v2.16b, v28.16b
   4121 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
   4122 
   4123 	aese	v3.16b, v28.16b
   4124 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
   4125 	aese	v4.16b, v28.16b
   4126 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
   4127 	aese	v6.16b, v28.16b
   4128 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
   4129 
   4130 	aese	v7.16b, v26.16b
   4131 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
   4132 
   4133 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   4134 	aese	v2.16b, v26.16b
   4135 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
   4136 	aese	v5.16b, v26.16b
   4137 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
   4138 
   4139 	aese	v0.16b, v26.16b
   4140 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
   4141 	aese	v3.16b, v26.16b
   4142 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
   4143 
   4144 	aese	v4.16b, v26.16b
   4145 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
   4146 	aese	v1.16b, v26.16b
   4147 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
   4148 	aese	v6.16b, v26.16b
   4149 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
   4150 
   4151 	aese	v3.16b, v27.16b
   4152 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
   4153 	aese	v2.16b, v27.16b
   4154 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
   4155 	aese	v5.16b, v27.16b
   4156 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
   4157 
   4158 	aese	v1.16b, v27.16b
   4159 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
   4160 	aese	v7.16b, v27.16b
   4161 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
   4162 	aese	v6.16b, v27.16b
   4163 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
   4164 
   4165 	aese	v0.16b, v27.16b
   4166 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
   4167 	aese	v5.16b, v28.16b
   4168 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
   4169 	aese	v4.16b, v27.16b
   4170 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
   4171 
   4172 	aese	v6.16b, v28.16b
   4173 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
   4174 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   4175 
   4176 	aese	v0.16b, v28.16b
   4177 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
   4178 	aese	v4.16b, v28.16b
   4179 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
   4180 	aese	v1.16b, v28.16b
   4181 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
   4182 
   4183 	aese	v3.16b, v28.16b
   4184 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
   4185 	aese	v2.16b, v28.16b
   4186 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
   4187 	aese	v7.16b, v28.16b
   4188 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
   4189 
   4190 	sub	x5, x5, #1		//byte_len - 1
   4191 
   4192 	aese	v4.16b, v26.16b
   4193 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
   4194 	aese	v5.16b, v26.16b
   4195 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
   4196 	aese	v1.16b, v26.16b
   4197 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
   4198 
   4199 	aese	v0.16b, v26.16b
   4200 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
   4201 	aese	v3.16b, v26.16b
   4202 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
   4203 	aese	v6.16b, v26.16b
   4204 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
   4205 
   4206 	aese	v7.16b, v26.16b
   4207 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
   4208 	aese	v2.16b, v26.16b
   4209 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
   4210 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   4211 
   4212 	add	v30.4s, v30.4s, v31.4s		//CTR block 7
   4213 
   4214 	aese	v3.16b, v27.16b
   4215 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
   4216 	aese	v7.16b, v27.16b
   4217 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
   4218 
   4219 	aese	v2.16b, v27.16b
   4220 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
   4221 	aese	v1.16b, v27.16b
   4222 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
   4223 	aese	v4.16b, v27.16b
   4224 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
   4225 
   4226 	aese	v6.16b, v27.16b
   4227 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
   4228 	aese	v0.16b, v27.16b
   4229 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
   4230 	aese	v5.16b, v27.16b
   4231 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
   4232 
   4233 	aese	v1.16b, v28.16b
   4234 	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
   4235 	aese	v2.16b, v28.16b
   4236 	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
   4237 	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   4238 
   4239 	aese	v7.16b, v28.16b
   4240 	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
   4241 	aese	v6.16b, v28.16b
   4242 	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
   4243 	aese	v5.16b, v28.16b
   4244 	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
   4245 
   4246 	aese	v4.16b, v28.16b
   4247 	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
   4248 	aese	v3.16b, v28.16b
   4249 	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
   4250 	aese	v0.16b, v28.16b
   4251 	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
   4252 
   4253 	add	x4, x0, x1, lsr #3		//end_input_ptr
   4254 	aese	v6.16b, v26.16b
   4255 	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
   4256 
   4257 	ld1	{ v19.16b}, [x3]
   4258 	ext	v19.16b, v19.16b, v19.16b, #8
   4259 	rev64	v19.16b, v19.16b
   4260 
   4261 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   4262 
   4263 	aese	v0.16b, v26.16b
   4264 	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
   4265 	add	x5, x5, x0
   4266 
   4267 	aese	v1.16b, v26.16b
   4268 	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
   4269 	aese	v7.16b, v26.16b
   4270 	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
   4271 	aese	v4.16b, v26.16b
   4272 	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
   4273 
   4274 	cmp	x0, x5				//check if we have <= 8 blocks
   4275 	aese	v3.16b, v26.16b
   4276 	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
   4277 
   4278 	aese	v5.16b, v26.16b
   4279 	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
   4280 	aese	v2.16b, v26.16b
   4281 	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
   4282 
   4283 	aese	v3.16b, v27.16b
   4284 	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
   4285 	aese	v1.16b, v27.16b
   4286 	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
   4287 	aese	v7.16b, v27.16b
   4288 	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
   4289 
   4290 	aese	v4.16b, v27.16b
   4291 	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
   4292 	aese	v0.16b, v27.16b
   4293 	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
   4294 	aese	v2.16b, v27.16b
   4295 	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
   4296 
   4297 	aese	v6.16b, v27.16b
   4298 	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
   4299 	aese	v5.16b, v27.16b
   4300 	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
   4301 	ldr	q26, [x8, #192]					//load rk12
   4302 
   4303 	aese	v0.16b, v28.16b						//AES block 0 - round 11
   4304 	aese	v1.16b, v28.16b						//AES block 1 - round 11
   4305 	aese	v4.16b, v28.16b						//AES block 4 - round 11
   4306 
   4307 	aese	v6.16b, v28.16b						//AES block 6 - round 11
   4308 	aese	v5.16b, v28.16b						//AES block 5 - round 11
   4309 	aese	v7.16b, v28.16b						//AES block 7 - round 11
   4310 
   4311 	aese	v2.16b, v28.16b						//AES block 2 - round 11
   4312 	aese	v3.16b, v28.16b						//AES block 3 - round 11
   4313 	b.ge	.L192_dec_tail						//handle tail
   4314 
   4315 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
   4316 
   4317 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
   4318 
   4319 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
   4320 
   4321 .inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 1 - result
   4322 .inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 0 - result
   4323 	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
   4324 
   4325 	rev32	v0.16b, v30.16b				//CTR block 8
   4326 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
   4327 
   4328 	rev32	v1.16b, v30.16b				//CTR block 9
   4329 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
   4330 .inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 3 - result
   4331 
   4332 .inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 2 - result
   4333 	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
   4334 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
   4335 
   4336 	rev32	v2.16b, v30.16b				//CTR block 10
   4337 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
   4338 
   4339 .inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 4 - result
   4340 
   4341 	rev32	v3.16b, v30.16b				//CTR block 11
   4342 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
   4343 
   4344 .inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 5 - result
   4345 	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
   4346 	cmp	x0, x5				//check if we have <= 8 blocks
   4347 
   4348 .inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 6 - result
   4349 .inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 7 - result
   4350 	rev32	v4.16b, v30.16b				//CTR block 12
   4351 
   4352 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
   4353 	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
   4354 	b.ge	.L192_dec_prepretail					//do prepretail
   4355 
   4356 .L192_dec_main_loop:	//main	loop start
   4357 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   4358 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   4359 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   4360 
   4361 	rev64	v8.16b, v8.16b						//GHASH block 8k
   4362 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   4363 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   4364 
   4365 	ldr	q23, [x3, #176]				//load h7l | h7h
   4366 	ext	v23.16b, v23.16b, v23.16b, #8
   4367 	ldr	q25, [x3, #208]				//load h8l | h8h
   4368 	ext	v25.16b, v25.16b, v25.16b, #8
   4369 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   4370 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   4371 
   4372 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
   4373 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   4374 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   4375 
   4376 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   4377 
   4378 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   4379 	aese	v1.16b, v26.16b
   4380 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   4381 	aese	v6.16b, v26.16b
   4382 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   4383 
   4384 	aese	v5.16b, v26.16b
   4385 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   4386 	aese	v4.16b, v26.16b
   4387 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   4388 	aese	v0.16b, v26.16b
   4389 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   4390 
   4391 	aese	v7.16b, v26.16b
   4392 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   4393 	aese	v2.16b, v26.16b
   4394 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   4395 	aese	v3.16b, v26.16b
   4396 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   4397 
   4398 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   4399 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   4400 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   4401 
   4402 	aese	v6.16b, v27.16b
   4403 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   4404 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   4405 	ldr	q20, [x3, #128]				//load h5l | h5h
   4406 	ext	v20.16b, v20.16b, v20.16b, #8
   4407 	ldr	q22, [x3, #160]				//load h6l | h6h
   4408 	ext	v22.16b, v22.16b, v22.16b, #8
   4409 
   4410 	aese	v0.16b, v27.16b
   4411 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   4412 	aese	v3.16b, v27.16b
   4413 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   4414 	aese	v7.16b, v27.16b
   4415 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   4416 
   4417 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   4418 	aese	v2.16b, v27.16b
   4419 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   4420 	aese	v4.16b, v27.16b
   4421 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   4422 
   4423 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   4424 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   4425 	aese	v1.16b, v27.16b
   4426 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   4427 
   4428 	aese	v5.16b, v27.16b
   4429 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   4430 	ldr	q21, [x3, #144]				//load h6k | h5k
   4431 	ldr	q24, [x3, #192]				//load h8k | h7k
   4432 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   4433 
   4434 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   4435 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   4436 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   4437 
   4438 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   4439 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   4440 	aese	v6.16b, v28.16b
   4441 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   4442 
   4443 	aese	v2.16b, v28.16b
   4444 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   4445 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   4446 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   4447 
   4448 	aese	v1.16b, v28.16b
   4449 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   4450 	aese	v6.16b, v26.16b
   4451 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   4452 	aese	v4.16b, v28.16b
   4453 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   4454 
   4455 	aese	v0.16b, v28.16b
   4456 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   4457 	aese	v7.16b, v28.16b
   4458 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   4459 	aese	v3.16b, v28.16b
   4460 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   4461 
   4462 	ldr	q23, [x3, #80]				//load h3l | h3h
   4463 	ext	v23.16b, v23.16b, v23.16b, #8
   4464 	ldr	q25, [x3, #112]				//load h4l | h4h
   4465 	ext	v25.16b, v25.16b, v25.16b, #8
   4466 	aese	v5.16b, v28.16b
   4467 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   4468 	aese	v2.16b, v26.16b
   4469 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   4470 
   4471 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   4472 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   4473 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   4474 
   4475 	aese	v3.16b, v26.16b
   4476 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   4477 	aese	v4.16b, v26.16b
   4478 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   4479 
   4480 	aese	v0.16b, v26.16b
   4481 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   4482 	aese	v7.16b, v26.16b
   4483 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   4484 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   4485 
   4486 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   4487 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   4488 	aese	v1.16b, v26.16b
   4489 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   4490 
   4491 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   4492 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   4493 
   4494 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   4495 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   4496 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   4497 
   4498 	aese	v5.16b, v26.16b
   4499 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   4500 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   4501 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   4502 
   4503 	aese	v4.16b, v27.16b
   4504 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   4505 	aese	v6.16b, v27.16b
   4506 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   4507 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   4508 
   4509 	aese	v5.16b, v27.16b
   4510 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   4511 	aese	v1.16b, v27.16b
   4512 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   4513 	aese	v3.16b, v27.16b
   4514 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   4515 
   4516 	aese	v2.16b, v27.16b
   4517 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   4518 	aese	v0.16b, v27.16b
   4519 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   4520 	aese	v7.16b, v27.16b
   4521 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   4522 
   4523 	ldr	q20, [x3, #32]				//load h1l | h1h
   4524 	ext	v20.16b, v20.16b, v20.16b, #8
   4525 	ldr	q22, [x3, #64]				//load h2l | h2h
   4526 	ext	v22.16b, v22.16b, v22.16b, #8
   4527 	aese	v3.16b, v28.16b
   4528 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   4529 	aese	v5.16b, v28.16b
   4530 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   4531 
   4532 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   4533 	aese	v7.16b, v28.16b
   4534 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   4535 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   4536 
   4537 	aese	v4.16b, v28.16b
   4538 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   4539 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   4540 	aese	v1.16b, v28.16b
   4541 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   4542 
   4543 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   4544 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   4545 	aese	v2.16b, v28.16b
   4546 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   4547 
   4548 	aese	v6.16b, v28.16b
   4549 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   4550 	aese	v0.16b, v28.16b
   4551 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   4552 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   4553 
   4554 	ldr	q21, [x3, #48]				//load h2k | h1k
   4555 	ldr	q24, [x3, #96]				//load h4k | h3k
   4556 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   4557 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   4558 
   4559 	aese	v0.16b, v26.16b
   4560 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   4561 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   4562 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   4563 
   4564 	aese	v7.16b, v26.16b
   4565 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   4566 	aese	v2.16b, v26.16b
   4567 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   4568 	aese	v6.16b, v26.16b
   4569 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   4570 
   4571 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   4572 	aese	v3.16b, v26.16b
   4573 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   4574 	aese	v1.16b, v26.16b
   4575 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   4576 
   4577 	aese	v2.16b, v27.16b
   4578 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   4579 	aese	v6.16b, v27.16b
   4580 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   4581 	aese	v5.16b, v26.16b
   4582 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   4583 
   4584 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   4585 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   4586 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   4587 
   4588 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   4589 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   4590 	aese	v4.16b, v26.16b
   4591 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   4592 
   4593 	aese	v5.16b, v27.16b
   4594 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   4595 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   4596 	aese	v3.16b, v27.16b
   4597 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   4598 
   4599 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   4600 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   4601 	aese	v1.16b, v27.16b
   4602 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   4603 
   4604 	aese	v4.16b, v27.16b
   4605 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   4606 	aese	v0.16b, v27.16b
   4607 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   4608 	aese	v7.16b, v27.16b
   4609 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   4610 
   4611 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   4612 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   4613 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   4614 
   4615 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   4616 	ldr	d16, [x10]			//MODULO - load modulo constant
   4617 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   4618 
   4619 	aese	v2.16b, v28.16b
   4620 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   4621 	aese	v5.16b, v28.16b
   4622 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   4623 	aese	v7.16b, v28.16b
   4624 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   4625 
   4626 	aese	v0.16b, v28.16b
   4627 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   4628 	aese	v3.16b, v28.16b
   4629 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   4630 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   4631 
   4632 	aese	v4.16b, v28.16b
   4633 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   4634 	aese	v1.16b, v28.16b
   4635 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   4636 	aese	v6.16b, v28.16b
   4637 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   4638 
   4639 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   4640 	rev32	v20.16b, v30.16b					//CTR block 8k+16
   4641 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
   4642 
   4643 	aese	v5.16b, v26.16b
   4644 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   4645 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   4646 	aese	v1.16b, v26.16b
   4647 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   4648 
   4649 	aese	v3.16b, v26.16b
   4650 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   4651 	aese	v7.16b, v26.16b
   4652 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   4653 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   4654 
   4655 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   4656 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
   4657 
   4658 	aese	v2.16b, v26.16b
   4659 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   4660 	aese	v0.16b, v26.16b
   4661 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   4662 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
   4663 
   4664 	rev32	v22.16b, v30.16b					//CTR block 8k+17
   4665 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   4666 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
   4667 
   4668 	aese	v6.16b, v26.16b
   4669 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   4670 	aese	v4.16b, v26.16b
   4671 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   4672 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   4673 
   4674 	aese	v3.16b, v27.16b
   4675 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   4676 	aese	v7.16b, v27.16b
   4677 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   4678 	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
   4679 
   4680 	rev32	v23.16b, v30.16b					//CTR block 8k+18
   4681 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
   4682 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   4683 
   4684 	aese	v0.16b, v27.16b
   4685 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   4686 	aese	v1.16b, v27.16b
   4687 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   4688 	ldr	q26, [x8, #192]					//load rk12
   4689 
   4690 	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
   4691 	aese	v4.16b, v27.16b
   4692 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   4693 	aese	v6.16b, v27.16b
   4694 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   4695 
   4696 	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
   4697 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   4698 	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
   4699 
   4700 	aese	v2.16b, v27.16b
   4701 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   4702 	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
   4703 	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
   4704 
   4705 .inst	0xce006900	//eor3 v0.16b, v8.16b, v0.16b, v26.16b				//AES block 8k+8 - result
   4706 	rev32	v25.16b, v30.16b					//CTR block 8k+19
   4707 	aese	v5.16b, v27.16b
   4708 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   4709 
   4710 	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
   4711 	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
   4712 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
   4713 
   4714 	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
   4715 	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
   4716 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   4717 
   4718 .inst	0xce016921	//eor3 v1.16b, v9.16b, v1.16b, v26.16b				//AES block 8k+9 - result
   4719 	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
   4720 .inst	0xce036963	//eor3 v3.16b, v11.16b, v3.16b, v26.16b				//AES block 8k+11 - result
   4721 
   4722 .inst	0xce026942	//eor3 v2.16b, v10.16b, v2.16b, v26.16b				//AES block 8k+10 - result
   4723 .inst	0xce0769e7	//eor3 v7.16b, v15.16b, v7.16b, v26.16b				//AES block 8k+15 - result
   4724 	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
   4725 
   4726 .inst	0xce0569a5	//eor3 v5.16b, v13.16b, v5.16b, v26.16b				//AES block 8k+13 - result
   4727 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   4728 	mov	v3.16b, v25.16b					//CTR block 8k+19
   4729 
   4730 .inst	0xce046984	//eor3 v4.16b, v12.16b, v4.16b, v26.16b				//AES block 8k+12 - result
   4731 	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
   4732 	cmp	x0, x5				//.LOOP CONTROL
   4733 
   4734 .inst	0xce0669c6	//eor3 v6.16b, v14.16b, v6.16b, v26.16b				//AES block 8k+14 - result
   4735 	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
   4736 	mov	v0.16b, v20.16b					//CTR block 8k+16
   4737 
   4738 	mov	v1.16b, v22.16b					//CTR block 8k+17
   4739 	mov	v2.16b, v23.16b					//CTR block 8k+18
   4740 
   4741 	rev32	v4.16b, v30.16b				//CTR block 8k+20
   4742 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
   4743 	b.lt	.L192_dec_main_loop
   4744 
   4745 .L192_dec_prepretail:	//PREPRETAIL
//
// Reached when the main loop's "cmp x0, x5 / b.lt" falls through.
// Nothing is loaded from x0 and nothing is stored to x2 in this section.
// Two jobs are interleaved instruction by instruction:
//   AES   - v5, v6 and v7 are generated from the counter in v30 (v0-v4 are
//           already set on entry), then rounds 0..11 are run on v0-v7 with
//           round keys fetched from x8. Round 11 is the bare aese with no
//           aesmc; rk12 is loaded into v26 but not applied here.
//   GHASH - the eight blocks held in v8-v15 (the main loop loads these as
//           ciphertext) are byte-reversed and multiplied by the x3 table
//           entries labelled h8..h1, accumulating into v17 (high),
//           v18 (mid) and v19 (low); the result is then reduced with the
//           constant read from [x10] and left in v19.
// v31 is added to v30 after each counter block is produced.
//
   4746 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   4747 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   4748 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   4749 
   4750 	ldr	q23, [x3, #176]				//load h7l | h7h
   4751 	ext	v23.16b, v23.16b, v23.16b, #8
   4752 	ldr	q25, [x3, #208]				//load h8l | h8h
   4753 	ext	v25.16b, v25.16b, v25.16b, #8
   4754 	rev64	v8.16b, v8.16b						//GHASH block 8k
   4755 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0 - swap the 64-bit halves of the running hash in v19
   4756 
   4757 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   4758 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   4759 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   4760 
   4761 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1 - fold the running hash into GHASH block 8k
   4762 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   4763 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   4764 
   4765 	ldr	q20, [x3, #128]				//load h5l | h5h
   4766 	ext	v20.16b, v20.16b, v20.16b, #8
   4767 	ldr	q22, [x3, #160]				//load h6l | h6h
   4768 	ext	v22.16b, v22.16b, v22.16b, #8
   4769 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   4770 
   4771 	aese	v0.16b, v26.16b
   4772 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   4773 	aese	v6.16b, v26.16b
   4774 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   4775 	aese	v5.16b, v26.16b
   4776 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   4777 
   4778 	aese	v3.16b, v26.16b
   4779 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   4780 	aese	v2.16b, v26.16b
   4781 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   4782 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   4783 
   4784 	aese	v4.16b, v26.16b
   4785 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   4786 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   4787 	aese	v1.16b, v26.16b
   4788 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   4789 
   4790 	aese	v6.16b, v27.16b
   4791 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   4792 	aese	v7.16b, v26.16b
   4793 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   4794 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   4795 
   4796 	aese	v4.16b, v27.16b
   4797 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   4798 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   4799 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   4800 
   4801 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   4802 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   4803 	aese	v3.16b, v27.16b
   4804 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   4805 
   4806 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   4807 	aese	v7.16b, v27.16b
   4808 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   4809 	aese	v0.16b, v27.16b
   4810 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   4811 
   4812 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   4813 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   4814 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   4815 
   4816 	aese	v2.16b, v27.16b
   4817 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   4818 	aese	v1.16b, v27.16b
   4819 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   4820 	aese	v5.16b, v27.16b
   4821 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   4822 
   4823 	ldr	q21, [x3, #144]				//load h6k | h5k
   4824 	ldr	q24, [x3, #192]				//load h8k | h7k
   4825 	aese	v3.16b, v28.16b
   4826 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   4827 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   4828 
   4829 	aese	v6.16b, v28.16b
   4830 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   4831 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   4832 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   4833 
   4834 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   4835 	aese	v4.16b, v28.16b
   4836 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   4837 	aese	v5.16b, v28.16b
   4838 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   4839 
   4840 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   4841 	aese	v3.16b, v26.16b
   4842 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   4843 	aese	v7.16b, v28.16b
   4844 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   4845 
   4846 	aese	v0.16b, v28.16b
   4847 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   4848 	aese	v2.16b, v28.16b
   4849 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   4850 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   4851 
   4852 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   4853 	aese	v1.16b, v28.16b
   4854 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   4855 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   4856 
   4857 	aese	v5.16b, v26.16b
   4858 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   4859 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   4860 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   4861 
   4862 	aese	v7.16b, v26.16b
   4863 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   4864 	aese	v6.16b, v26.16b
   4865 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   4866 	aese	v4.16b, v26.16b
   4867 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   4868 
   4869 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   4870 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   4871 	aese	v0.16b, v26.16b
   4872 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   4873 
// Blocks 8k..8k+3 have consumed the h5..h8 entries; v23/v25/v20/v22 are now
// reloaded with the entries labelled h3/h4/h1/h2 for blocks 8k+4..8k+7.
   4874 	ldr	q23, [x3, #80]				//load h3l | h3h
   4875 	ext	v23.16b, v23.16b, v23.16b, #8
   4876 	ldr	q25, [x3, #112]				//load h4l | h4h
   4877 	ext	v25.16b, v25.16b, v25.16b, #8
   4878 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   4879 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   4880 
   4881 	ldr	q20, [x3, #32]				//load h1l | h1h
   4882 	ext	v20.16b, v20.16b, v20.16b, #8
   4883 	ldr	q22, [x3, #64]				//load h2l | h2h
   4884 	ext	v22.16b, v22.16b, v22.16b, #8
   4885 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   4886 	aese	v2.16b, v26.16b
   4887 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   4888 
   4889 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   4890 
   4891 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   4892 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   4893 
   4894 	aese	v5.16b, v27.16b
   4895 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   4896 	aese	v4.16b, v27.16b
   4897 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   4898 	aese	v1.16b, v26.16b
   4899 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   4900 
   4901 	aese	v2.16b, v27.16b
   4902 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   4903 	aese	v0.16b, v27.16b
   4904 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   4905 	aese	v3.16b, v27.16b
   4906 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   4907 
   4908 	aese	v1.16b, v27.16b
   4909 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   4910 	aese	v6.16b, v27.16b
   4911 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   4912 	aese	v7.16b, v27.16b
   4913 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   4914 
   4915 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   4916 	ldr	q21, [x3, #48]				//load h2k | h1k
   4917 	ldr	q24, [x3, #96]				//load h4k | h3k
   4918 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   4919 
   4920 	aese	v7.16b, v28.16b
   4921 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   4922 	aese	v1.16b, v28.16b
   4923 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   4924 	aese	v2.16b, v28.16b
   4925 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   4926 
   4927 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   4928 	aese	v6.16b, v28.16b
   4929 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   4930 	aese	v5.16b, v28.16b
   4931 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   4932 
   4933 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   4934 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   4935 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   4936 
   4937 	aese	v4.16b, v28.16b
   4938 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   4939 
   4940 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   4941 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   4942 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   4943 
   4944 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   4945 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   4946 	aese	v0.16b, v28.16b
   4947 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   4948 
   4949 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   4950 	aese	v3.16b, v28.16b
   4951 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   4952 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   4953 
   4954 	aese	v4.16b, v26.16b
   4955 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   4956 	aese	v2.16b, v26.16b
   4957 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   4958 
   4959 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   4960 	aese	v1.16b, v26.16b
   4961 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   4962 	aese	v7.16b, v26.16b
   4963 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   4964 
   4965 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   4966 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   4967 	aese	v0.16b, v26.16b
   4968 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   4969 
   4970 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   4971 	aese	v5.16b, v26.16b
   4972 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   4973 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   4974 
   4975 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   4976 	aese	v4.16b, v27.16b
   4977 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   4978 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   4979 
   4980 	aese	v3.16b, v26.16b
   4981 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   4982 	aese	v6.16b, v26.16b
   4983 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   4984 	aese	v5.16b, v27.16b
   4985 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   4986 
   4987 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   4988 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   4989 	aese	v2.16b, v27.16b
   4990 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   4991 
   4992 	ldr	d16, [x10]			//MODULO - load modulo constant
   4993 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   4994 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   4995 
   4996 	aese	v1.16b, v27.16b
   4997 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   4998 	aese	v7.16b, v27.16b
   4999 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   5000 	aese	v6.16b, v27.16b
   5001 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   5002 
   5003 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   5004 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   5005 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   5006 
   5007 	aese	v0.16b, v27.16b
   5008 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   5009 	aese	v3.16b, v27.16b
   5010 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   5011 
// Reduction, interleaved with AES rounds 8..11 below: v18 (mid) first
// absorbs v17 (high) and v19 (low); v17 is multiplied by the constant in
// v16 and folded into v18 together with its half-swapped copy; v18 is then
// treated the same way and folded into v19.
   5012 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   5013 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   5014 	aese	v2.16b, v28.16b
   5015 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   5016 
   5017 	aese	v6.16b, v28.16b
   5018 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   5019 	aese	v7.16b, v28.16b
   5020 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   5021 	aese	v1.16b, v28.16b
   5022 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   5023 
   5024 	aese	v3.16b, v28.16b
   5025 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   5026 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   5027 	aese	v0.16b, v28.16b
   5028 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   5029 
   5030 	aese	v5.16b, v28.16b
   5031 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   5032 	aese	v4.16b, v28.16b
   5033 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   5034 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   5035 
   5036 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   5037 	aese	v7.16b, v26.16b
   5038 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   5039 	aese	v6.16b, v26.16b
   5040 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   5041 
   5042 	aese	v5.16b, v26.16b
   5043 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   5044 	aese	v2.16b, v26.16b
   5045 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   5046 	aese	v3.16b, v26.16b
   5047 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   5048 
   5049 	aese	v0.16b, v26.16b
   5050 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   5051 	aese	v1.16b, v26.16b
   5052 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   5053 	aese	v4.16b, v26.16b
   5054 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   5055 
   5056 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   5057 	ldr	q26, [x8, #192]					//load rk12 - not applied here; the tail copies v26 to v29 and XORs it in
   5058 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   5059 
   5060 	aese	v2.16b, v27.16b
   5061 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   5062 	aese	v5.16b, v27.16b
   5063 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   5064 	aese	v0.16b, v27.16b
   5065 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   5066 
   5067 	aese	v4.16b, v27.16b
   5068 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   5069 	aese	v6.16b, v27.16b
   5070 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   5071 	aese	v7.16b, v27.16b
   5072 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   5073 
   5074 	aese	v0.16b, v28.16b						//AES block 8k+8 - round 11
   5075 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   5076 	aese	v5.16b, v28.16b						//AES block 8k+13 - round 11
   5077 
   5078 	aese	v2.16b, v28.16b						//AES block 8k+10 - round 11
   5079 	aese	v3.16b, v27.16b
   5080 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   5081 	aese	v1.16b, v27.16b
   5082 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   5083 
   5084 	aese	v6.16b, v28.16b						//AES block 8k+14 - round 11
   5085 	aese	v4.16b, v28.16b						//AES block 8k+12 - round 11
   5086 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   5087 
   5088 	aese	v3.16b, v28.16b						//AES block 8k+11 - round 11
   5089 	aese	v1.16b, v28.16b						//AES block 8k+9 - round 11
   5090 	aese	v7.16b, v28.16b						//AES block 8k+15 - round 11
   5091 
   5092 .L192_dec_tail:	//TAIL
//
// Tail dispatcher. x5 = x4 - x0 is the number of bytes still to process
// (x4 is presumably the end-of-input pointer - set outside this view).
// Preloads the x3 table entries that the longest tail needs, copies v26 to
// v29 (v26 holds rk12 when arriving from the prepretail), swaps the halves
// of the running hash v19 into v16, and forms the first tail result
// v12 = ciphertext(v9) ^ v0 ^ v29.
//
// The compare ladder (112, 96, ... 16 bytes) then selects an entry label.
// Every entry label XORs a fixed keystream register (more_than_7 uses v1,
// more_than_6 uses v2, more_than_5 uses v3, ...), so for each block that
// is absent the ladder:
//   - shifts the keystream registers one slot up (v7<-v6, v6<-v5, ...) and
//     places v1 in the slot the chosen entry label will read next;
//   - subtracts v31 from v30 once (presumably rewinding the counter for a
//     keystream block that will not be used - TODO confirm).
// v17/v18/v19 are zeroed on the <=7-block path: only
// .L192_dec_blocks_more_than_7 writes them directly, the later labels
// accumulate into them with eor.
// Note: b.gt is a signed comparison of x5.
//
   5093 
   5094 	sub	x5, x4, x0 	//main_end_input_ptr is number of bytes left to process
   5095 
   5096 	ldp	q20, q21, [x3, #128]			//load h5l | h5h (v20) and h6k | h5k (v21)
   5097 	ext	v20.16b, v20.16b, v20.16b, #8
   5098 	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
   5099 
   5100 	ldp	q24, q25, [x3, #192]			//load h8k | h7k (v24) and h8l | h8h (v25)
   5101 	ext	v25.16b, v25.16b, v25.16b, #8
   5102 
   5103 	mov	v29.16b, v26.16b
   5104 
   5105 	ldp	q22, q23, [x3, #160]			//load h6l | h6h (v22) and h7l | h7h (v23)
   5106 	ext	v22.16b, v22.16b, v22.16b, #8
   5107 	ext	v23.16b, v23.16b, v23.16b, #8
   5108 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
   5109 
   5110 .inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
   5111 	cmp	x5, #112
   5112 	b.gt	.L192_dec_blocks_more_than_7
   5113 
   5114 	mov	v7.16b, v6.16b
   5115 	movi	v17.8b, #0
   5116 	sub	v30.4s, v30.4s, v31.4s
   5117 
   5118 	mov	v6.16b, v5.16b
   5119 	mov	v5.16b, v4.16b
   5120 	mov	v4.16b, v3.16b
   5121 
   5122 	cmp	x5, #96
   5123 	movi	v19.8b, #0
   5124 	mov	v3.16b, v2.16b
   5125 
   5126 	mov	v2.16b, v1.16b
   5127 	movi	v18.8b, #0
   5128 	b.gt	.L192_dec_blocks_more_than_6
   5129 
   5130 	mov	v7.16b, v6.16b
   5131 	mov	v6.16b, v5.16b
   5132 	mov	v5.16b, v4.16b
   5133 
   5134 	mov	v4.16b, v3.16b
   5135 	mov	v3.16b, v1.16b
   5136 
   5137 	sub	v30.4s, v30.4s, v31.4s
   5138 	cmp	x5, #80
   5139 	b.gt	.L192_dec_blocks_more_than_5
   5140 
   5141 	mov	v7.16b, v6.16b
   5142 	mov	v6.16b, v5.16b
   5143 
   5144 	mov	v5.16b, v4.16b
   5145 	mov	v4.16b, v1.16b
   5146 	cmp	x5, #64
   5147 
   5148 	sub	v30.4s, v30.4s, v31.4s
   5149 	b.gt	.L192_dec_blocks_more_than_4
   5150 
   5151 	sub	v30.4s, v30.4s, v31.4s
   5152 	mov	v7.16b, v6.16b
   5153 	mov	v6.16b, v5.16b
   5154 
   5155 	mov	v5.16b, v1.16b
   5156 	cmp	x5, #48
   5157 	b.gt	.L192_dec_blocks_more_than_3
   5158 
   5159 	sub	v30.4s, v30.4s, v31.4s
   5160 	mov	v7.16b, v6.16b
   5161 	cmp	x5, #32
   5162 
   5163 	mov	v6.16b, v1.16b
   5164 	ldr	q24, [x3, #96]				//load h4k | h3k
   5165 	b.gt	.L192_dec_blocks_more_than_2
   5166 
   5167 	sub	v30.4s, v30.4s, v31.4s
   5168 
   5169 	mov	v7.16b, v1.16b
   5170 	cmp	x5, #16
   5171 	b.gt	.L192_dec_blocks_more_than_1
   5172 
   5173 	sub	v30.4s, v30.4s, v31.4s
   5174 	ldr	q21, [x3, #48]				//load h2k | h1k
   5175 	b	.L192_dec_blocks_less_than_1
   5176 .L192_dec_blocks_more_than_7:	//blocks	left >  7
// Handles the "final-7" block. On entry v9 is its ciphertext and v12 its
// finished result. This label starts the GHASH accumulators: v17 (high),
// v19 (low) and v18 (mid) are written directly rather than XORed into,
// using v25 and the upper half of v24 as multipliers. It also loads the
// next ciphertext, stores v12 to [x2] and forms the next result from v1.
   5177 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   5178 
   5179 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
   5180 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5181 
   5182 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
   5183 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
   5184 	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
   5185 
   5186 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
   5187 
   5188 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
   5189 	st1	{ v12.16b}, [x2], #16			 	//AES final-7 block  - store result
   5190 
   5191 .inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
   5192 
   5193 	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
   5194 	movi	v16.8b, #0						//suppress further partial tag feed in
   5195 .L192_dec_blocks_more_than_6:	//blocks	left >  6
// Handles the "final-6" block: GHASHes the ciphertext in v9 (with v16 fed
// in, which is zero unless this is the first tail block processed) using
// v23 and the low half of v24, XOR-accumulating into v17/v18/v19. Stores
// the pending result v12, loads the next ciphertext and forms its result
// from v2.
   5196 
   5197 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   5198 
   5199 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5200 
   5201 	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
   5202 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
   5203 
   5204 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
   5205 	movi	v16.8b, #0						//suppress further partial tag feed in
   5206 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   5207 
   5208 	st1	{ v12.16b}, [x2], #16			 	//AES final-6 block - store result
   5209 .inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
   5210 
   5211 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
   5212 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
   5213 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
   5214 
   5215 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
   5216 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
   5217 .L192_dec_blocks_more_than_5:	//blocks	left >  5
// Handles the "final-5" block: GHASHes the ciphertext in v9 using v22 for
// high/low and the upper half of v21 for mid (the mid operand is copied to
// v27.d[1] so that pmull2 can be used). Stores the pending result v12,
// loads the next ciphertext and forms its result from v3.
   5218 
   5219 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   5220 
   5221 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5222 
   5223 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
   5224 
   5225 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
   5226 
   5227 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
   5228 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   5229 
   5230 	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
   5231 
   5232 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
   5233 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   5234 
   5235 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
   5236 
   5237 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
   5238 	movi	v16.8b, #0						//suppress further partial tag feed in
   5239 	st1	{ v12.16b}, [x2], #16			 	//AES final-5 block - store result
   5240 
   5241 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
   5242 .inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
   5243 .L192_dec_blocks_more_than_4:	//blocks	left >  4
// Handles the "final-4" block: GHASHes the ciphertext in v9 using v20 for
// high/low and the low half of v21 for mid, XOR-accumulating into
// v17/v18/v19. Stores the pending result v12, loads the next ciphertext and
// forms its result from v4.
   5244 
   5245 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   5246 
   5247 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5248 	movi	v16.8b, #0						//suppress further partial tag feed in
   5249 
   5250 	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
   5251 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
   5252 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   5253 
   5254 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
   5255 
   5256 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
   5257 
   5258 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
   5259 	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
   5260 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   5261 
   5262 .inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
   5263 
   5264 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
   5265 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
   5266 .L192_dec_blocks_more_than_3:	//blocks	left >  3
// Handles the "final-3" block. Reloads v25 from [x3,#112] and v24 from
// [x3,#96] because the short-tail paths skip the blocks above; GHASHes the
// ciphertext in v9 using v25 for high/low and the upper half of v24 for
// mid. Stores the pending result v12, loads the next ciphertext and forms
// its result from v5.
   5267 
   5268 	ldr	q25, [x3, #112]				//load h4l | h4h
   5269 	ext	v25.16b, v25.16b, v25.16b, #8
   5270 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   5271 	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
   5272 
   5273 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5274 
   5275 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
   5276 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   5277 
   5278 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
   5279 	movi	v16.8b, #0						//suppress further partial tag feed in
   5280 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   5281 
   5282 	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
   5283 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
   5284 .inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
   5285 
   5286 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
   5287 	ldr	q24, [x3, #96]				//load h4k | h3k
   5288 
   5289 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
   5290 
   5291 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
   5292 
   5293 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
   5294 .L192_dec_blocks_more_than_2:	//blocks	left >  2
   5295 
   5296 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   5297 	ldr	q23, [x3, #80]				//load h3l | h3h
   5298 	ext	v23.16b, v23.16b, v23.16b, #8
   5299 
   5300 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5301 
   5302 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
   5303 	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
   5304 
   5305 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   5306 
   5307 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
   5308 
   5309 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
   5310 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   5311 
   5312 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
   5313 	movi	v16.8b, #0						//suppress further partial tag feed in
   5314 
   5315 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
   5316 	st1	{ v12.16b}, [x2], #16			 	//AES final-2 block - store result
   5317 
   5318 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
   5319 .inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
   5320 .L192_dec_blocks_more_than_1:	//blocks	left >  1
   5321 
   5322 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   5323 	ldr	q9, [x0], #16				//AES final block - load ciphertext
   5324 	ldr	q22, [x3, #64]				//load h2l | h2h
   5325 	ext	v22.16b, v22.16b, v22.16b, #8
   5326 
   5327 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5328 	movi	v16.8b, #0						//suppress further partial tag feed in
   5329 	ldr	q21, [x3, #48]				//load h2k | h1k
   5330 
   5331 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   5332 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
   5333 	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
   5334 
   5335 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   5336 
   5337 .inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
   5338 
   5339 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
   5340 
   5341 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
   5342 
   5343 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
   5344 
   5345 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
   5346 
   5347 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
   5348 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
   5349 .L192_dec_blocks_less_than_1:	//blocks	left <= 1
   5350 
   5351 	rev32	v30.16b, v30.16b
   5352 	and	x1, x1, #127				//bit_length %= 128
   5353 
   5354 	sub	x1, x1, #128				//bit_length -= 128
   5355 	str	q30, [x16]					//store the updated counter
   5356 
   5357 	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
   5358 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   5359 
   5360 	and	x1, x1, #127				//bit_length %= 128
   5361 
   5362 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   5363 	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
   5364 	cmp	x1, #64
   5365 
   5366 	csel	x13, x7, x6, lt
   5367 	csel	x14, x6, xzr, lt
   5368 	ldr	q20, [x3, #32]				//load h1l | h1h
   5369 	ext	v20.16b, v20.16b, v20.16b, #8
   5370 
   5371 	mov	v0.d[1], x14
   5372 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   5373 
   5374 	mov	v0.d[0], x13					//ctr0b is mask for last block
   5375 
   5376 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   5377 	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
   5378 
   5379 	rev64	v8.16b, v9.16b						//GHASH final block
   5380 
   5381 	st1	{ v12.16b}, [x2]				//store all 16B
   5382 
   5383 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   5384 
   5385 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
   5386 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   5387 
   5388 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
   5389 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   5390 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
   5391 
   5392 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
   5393 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
   5394 
   5395 	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
   5396 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
   5397 	ldr	d16, [x10]			//MODULO - load modulo constant
   5398 
   5399 	pmull	v21.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
   5400 	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   5401 
   5402 	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
   5403 
   5404 .inst	0xce115652	//eor3 v18.16b, v18.16b, v17.16b, v21.16b			//MODULO - fold into mid
   5405 
   5406 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   5407 	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   5408 
   5409 .inst	0xce124673	//eor3 v19.16b, v19.16b, v18.16b, v17.16b			//MODULO - fold into low
   5410 	ext	v19.16b, v19.16b, v19.16b, #8
   5411 	rev64	v19.16b, v19.16b
   5412 	st1	{ v19.16b }, [x3]
   5413 
   5414 	mov	x0, x9
   5415 
   5416 	ldp	d10, d11, [sp, #16]
   5417 	ldp	d12, d13, [sp, #32]
   5418 	ldp	d14, d15, [sp, #48]
   5419 	ldp	d8, d9, [sp], #80
   5420 	ret
   5421 
   5422 .L192_dec_ret:
   5423 	mov	w0, #0x0
   5424 	ret
   5425 .size	unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
   5426 .globl	unroll8_eor3_aes_gcm_enc_256_kernel
   5427 .type	unroll8_eor3_aes_gcm_enc_256_kernel,%function
   5428 .align	4
   5429 unroll8_eor3_aes_gcm_enc_256_kernel:
   5430 	AARCH64_VALID_CALL_TARGET
   5431 	cbz	x1, .L256_enc_ret
   5432 	stp	d8, d9, [sp, #-80]!
   5433 	lsr	x9, x1, #3
   5434 	mov	x16, x4
   5435 	mov	x8, x5
   5436 	stp	d10, d11, [sp, #16]
   5437 	stp	d12, d13, [sp, #32]
   5438 	stp	d14, d15, [sp, #48]
   5439 	mov	x5, #0xc200000000000000
   5440 	stp	x5, xzr, [sp, #64]
   5441 	add	x10, sp, #64
   5442 
   5443 	ld1	{ v0.16b}, [x16]					//CTR block 0
   5444 
   5445 	mov	x5, x9
   5446 
   5447 	mov	x15, #0x100000000			//set up counter increment
   5448 	movi	v31.16b, #0x0
   5449 	mov	v31.d[1], x15
   5450 	sub	x5, x5, #1		//byte_len - 1
   5451 
   5452 	and	x5, x5, #0xffffffffffffff80	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   5453 
   5454 	add	x5, x5, x0
   5455 
   5456 	rev32	v30.16b, v0.16b				//set up reversed counter
   5457 
   5458 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
   5459 
   5460 	rev32	v1.16b, v30.16b				//CTR block 1
   5461 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
   5462 
   5463 	rev32	v2.16b, v30.16b				//CTR block 2
   5464 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
   5465 
   5466 	rev32	v3.16b, v30.16b				//CTR block 3
   5467 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
   5468 
   5469 	rev32	v4.16b, v30.16b				//CTR block 4
   5470 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
   5471 
   5472 	rev32	v5.16b, v30.16b				//CTR block 5
   5473 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
   5474 	ldp	q26, q27, [x8, #0]				 	//load rk0, rk1
   5475 
   5476 	rev32	v6.16b, v30.16b				//CTR block 6
   5477 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
   5478 
   5479 	rev32	v7.16b, v30.16b				//CTR block 7
   5480 
   5481 	aese	v3.16b, v26.16b
   5482 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
   5483 	aese	v4.16b, v26.16b
   5484 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
   5485 	aese	v2.16b, v26.16b
   5486 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
   5487 
   5488 	aese	v0.16b, v26.16b
   5489 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
   5490 	aese	v1.16b, v26.16b
   5491 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
   5492 	aese	v6.16b, v26.16b
   5493 	aesmc	v6.16b, v6.16b			//AES block 6 - round 0
   5494 
   5495 	aese	v5.16b, v26.16b
   5496 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
   5497 	aese	v7.16b, v26.16b
   5498 	aesmc	v7.16b, v7.16b			//AES block 7 - round 0
   5499 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   5500 
   5501 	aese	v4.16b, v27.16b
   5502 	aesmc	v4.16b, v4.16b			//AES block 4 - round 1
   5503 	aese	v1.16b, v27.16b
   5504 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
   5505 	aese	v3.16b, v27.16b
   5506 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
   5507 
   5508 	aese	v6.16b, v27.16b
   5509 	aesmc	v6.16b, v6.16b			//AES block 6 - round 1
   5510 	aese	v5.16b, v27.16b
   5511 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
   5512 
   5513 	aese	v2.16b, v27.16b
   5514 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
   5515 
   5516 	aese	v7.16b, v27.16b
   5517 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
   5518 
   5519 	aese	v2.16b, v28.16b
   5520 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
   5521 	aese	v3.16b, v28.16b
   5522 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
   5523 	aese	v0.16b, v27.16b
   5524 	aesmc	v0.16b, v0.16b			//AES block 0 - round 1
   5525 
   5526 	aese	v7.16b, v28.16b
   5527 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
   5528 	aese	v6.16b, v28.16b
   5529 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
   5530 	aese	v5.16b, v28.16b
   5531 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
   5532 
   5533 	aese	v4.16b, v28.16b
   5534 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
   5535 	aese	v0.16b, v28.16b
   5536 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
   5537 	aese	v1.16b, v28.16b
   5538 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
   5539 
   5540 	aese	v5.16b, v26.16b
   5541 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
   5542 	aese	v3.16b, v26.16b
   5543 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
   5544 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   5545 
   5546 	aese	v4.16b, v26.16b
   5547 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
   5548 
   5549 	aese	v1.16b, v26.16b
   5550 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
   5551 	aese	v6.16b, v26.16b
   5552 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
   5553 	aese	v7.16b, v26.16b
   5554 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
   5555 
   5556 	aese	v2.16b, v26.16b
   5557 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
   5558 	aese	v0.16b, v26.16b
   5559 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
   5560 
   5561 	aese	v4.16b, v27.16b
   5562 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
   5563 	aese	v6.16b, v27.16b
   5564 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
   5565 	aese	v1.16b, v27.16b
   5566 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
   5567 
   5568 	aese	v2.16b, v27.16b
   5569 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
   5570 	aese	v0.16b, v27.16b
   5571 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
   5572 
   5573 	aese	v3.16b, v27.16b
   5574 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
   5575 	aese	v7.16b, v27.16b
   5576 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
   5577 	aese	v5.16b, v27.16b
   5578 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
   5579 
   5580 	aese	v0.16b, v28.16b
   5581 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
   5582 	aese	v2.16b, v28.16b
   5583 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
   5584 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   5585 
   5586 	aese	v1.16b, v28.16b
   5587 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
   5588 	aese	v4.16b, v28.16b
   5589 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
   5590 	aese	v5.16b, v28.16b
   5591 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
   5592 
   5593 	aese	v3.16b, v28.16b
   5594 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
   5595 	aese	v6.16b, v28.16b
   5596 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
   5597 	aese	v7.16b, v28.16b
   5598 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
   5599 
   5600 	aese	v1.16b, v26.16b
   5601 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
   5602 	aese	v5.16b, v26.16b
   5603 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
   5604 	aese	v4.16b, v26.16b
   5605 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
   5606 
   5607 	aese	v2.16b, v26.16b
   5608 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
   5609 	aese	v6.16b, v26.16b
   5610 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
   5611 	aese	v0.16b, v26.16b
   5612 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
   5613 
   5614 	aese	v7.16b, v26.16b
   5615 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
   5616 	aese	v3.16b, v26.16b
   5617 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
   5618 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   5619 
   5620 	aese	v2.16b, v27.16b
   5621 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
   5622 	aese	v0.16b, v27.16b
   5623 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
   5624 
   5625 	aese	v7.16b, v27.16b
   5626 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
   5627 	aese	v6.16b, v27.16b
   5628 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
   5629 	aese	v1.16b, v27.16b
   5630 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
   5631 
   5632 	aese	v5.16b, v27.16b
   5633 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
   5634 	aese	v3.16b, v27.16b
   5635 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
   5636 
   5637 	aese	v4.16b, v27.16b
   5638 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
   5639 
   5640 	aese	v6.16b, v28.16b
   5641 	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
   5642 	aese	v1.16b, v28.16b
   5643 	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
   5644 
   5645 	aese	v3.16b, v28.16b
   5646 	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
   5647 	aese	v0.16b, v28.16b
   5648 	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
   5649 	aese	v7.16b, v28.16b
   5650 	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
   5651 
   5652 	aese	v5.16b, v28.16b
   5653 	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
   5654 	aese	v4.16b, v28.16b
   5655 	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
   5656 	aese	v2.16b, v28.16b
   5657 	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
   5658 
   5659 	ld1	{ v19.16b}, [x3]
   5660 	ext	v19.16b, v19.16b, v19.16b, #8
   5661 	rev64	v19.16b, v19.16b
   5662 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   5663 
   5664 	aese	v6.16b, v26.16b
   5665 	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
   5666 	aese	v7.16b, v26.16b
   5667 	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
   5668 	aese	v3.16b, v26.16b
   5669 	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
   5670 
   5671 	aese	v4.16b, v26.16b
   5672 	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
   5673 	aese	v5.16b, v26.16b
   5674 	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
   5675 	aese	v2.16b, v26.16b
   5676 	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
   5677 
   5678 	aese	v1.16b, v26.16b
   5679 	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
   5680 
   5681 	aese	v7.16b, v27.16b
   5682 	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
   5683 	aese	v4.16b, v27.16b
   5684 	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
   5685 	aese	v0.16b, v26.16b
   5686 	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
   5687 
   5688 	aese	v1.16b, v27.16b
   5689 	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
   5690 	aese	v5.16b, v27.16b
   5691 	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
   5692 	aese	v3.16b, v27.16b
   5693 	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
   5694 
   5695 	aese	v2.16b, v27.16b
   5696 	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
   5697 	aese	v0.16b, v27.16b
   5698 	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
   5699 	aese	v6.16b, v27.16b
   5700 	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
   5701 
   5702 	aese	v4.16b, v28.16b
   5703 	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
   5704 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   5705 	aese	v5.16b, v28.16b
   5706 	aesmc	v5.16b, v5.16b			//AES block 5 - round 11
   5707 
   5708 	aese	v2.16b, v28.16b
   5709 	aesmc	v2.16b, v2.16b			//AES block 2 - round 11
   5710 	aese	v6.16b, v28.16b
   5711 	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
   5712 	aese	v1.16b, v28.16b
   5713 	aesmc	v1.16b, v1.16b			//AES block 1 - round 11
   5714 
   5715 	aese	v0.16b, v28.16b
   5716 	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
   5717 	aese	v3.16b, v28.16b
   5718 	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
   5719 	aese	v7.16b, v28.16b
   5720 	aesmc	v7.16b, v7.16b			//AES block 7 - round 11
   5721 
   5722 	add	v30.4s, v30.4s, v31.4s		//CTR block 7
   5723 	ldr	q28, [x8, #224]					//load rk14
   5724 
   5725 	aese	v4.16b, v26.16b
   5726 	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
   5727 	aese	v2.16b, v26.16b
   5728 	aesmc	v2.16b, v2.16b			//AES block 2 - round 12
   5729 	aese	v1.16b, v26.16b
   5730 	aesmc	v1.16b, v1.16b			//AES block 1 - round 12
   5731 
   5732 	aese	v0.16b, v26.16b
   5733 	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
   5734 	aese	v5.16b, v26.16b
   5735 	aesmc	v5.16b, v5.16b			//AES block 5 - round 12
   5736 	aese	v3.16b, v26.16b
   5737 	aesmc	v3.16b, v3.16b			//AES block 3 - round 12
   5738 
   5739 	aese	v2.16b, v27.16b						//AES block 2 - round 13
   5740 	aese	v1.16b, v27.16b						//AES block 1 - round 13
   5741 	aese	v4.16b, v27.16b						//AES block 4 - round 13
   5742 
   5743 	aese	v6.16b, v26.16b
   5744 	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
   5745 	aese	v7.16b, v26.16b
   5746 	aesmc	v7.16b, v7.16b			//AES block 7 - round 12
   5747 
   5748 	aese	v0.16b, v27.16b						//AES block 0 - round 13
   5749 	aese	v5.16b, v27.16b						//AES block 5 - round 13
   5750 
   5751 	aese	v6.16b, v27.16b						//AES block 6 - round 13
   5752 	aese	v7.16b, v27.16b						//AES block 7 - round 13
   5753 	aese	v3.16b, v27.16b						//AES block 3 - round 13
   5754 
   5755 	add	x4, x0, x1, lsr #3		//end_input_ptr
   5756 	cmp	x0, x5				//check if we have <= 8 blocks
   5757 	b.ge	.L256_enc_tail						//handle tail
   5758 
   5759 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load plaintext
   5760 
   5761 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load plaintext
   5762 
   5763 .inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
   5764 	rev32	v0.16b, v30.16b				//CTR block 8
   5765 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
   5766 
   5767 .inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
   5768 .inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result
   5769 
   5770 	rev32	v1.16b, v30.16b				//CTR block 9
   5771 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
   5772 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
   5773 
   5774 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
   5775 .inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
   5776 	cmp	x0, x5				//check if we have <= 8 blocks
   5777 
   5778 	rev32	v2.16b, v30.16b				//CTR block 10
   5779 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
   5780 	stp	q8, q9, [x2], #32			//AES block 0, 1 - store result
   5781 
   5782 	stp	q10, q11, [x2], #32			//AES block 2, 3 - store result
   5783 
   5784 	rev32	v3.16b, v30.16b				//CTR block 11
   5785 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
   5786 
   5787 .inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
   5788 
   5789 .inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
   5790 .inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
   5791 .inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
   5792 
   5793 	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
   5794 	rev32	v4.16b, v30.16b				//CTR block 12
   5795 
   5796 	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
   5797 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
   5798 	b.ge	.L256_enc_prepretail					//do prepretail
   5799 
   5800 .L256_enc_main_loop:	//main	loop start
   5801 	ldp	q26, q27, [x8, #0]					//load rk0, rk1
   5802 
   5803 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   5804 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   5805 	ldr	q21, [x3, #144]				//load h6k | h5k
   5806 	ldr	q24, [x3, #192]				//load h8k | h7k
   5807 
   5808 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   5809 	ldr	q20, [x3, #128]				//load h5l | h5h
   5810 	ext	v20.16b, v20.16b, v20.16b, #8
   5811 	ldr	q22, [x3, #160]				//load h6l | h6h
   5812 	ext	v22.16b, v22.16b, v22.16b, #8
   5813 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   5814 
   5815 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   5816 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   5817 	rev64	v8.16b, v8.16b						//GHASH block 8k
   5818 
   5819 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   5820 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   5821 	ldr	q23, [x3, #176]				//load h7l | h7h
   5822 	ext	v23.16b, v23.16b, v23.16b, #8
   5823 	ldr	q25, [x3, #208]				//load h8l | h8h
   5824 	ext	v25.16b, v25.16b, v25.16b, #8
   5825 
   5826 	aese	v3.16b, v26.16b
   5827 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   5828 	aese	v5.16b, v26.16b
   5829 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   5830 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   5831 
   5832 	aese	v0.16b, v26.16b
   5833 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   5834 	aese	v1.16b, v26.16b
   5835 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   5836 	aese	v6.16b, v26.16b
   5837 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   5838 
   5839 	aese	v7.16b, v26.16b
   5840 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   5841 	aese	v2.16b, v26.16b
   5842 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   5843 	aese	v4.16b, v26.16b
   5844 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   5845 
   5846 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   5847 	eor	v8.16b, v8.16b, v19.16b				 	//PRE 1
   5848 	aese	v6.16b, v27.16b
   5849 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   5850 
   5851 	aese	v2.16b, v27.16b
   5852 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   5853 	aese	v1.16b, v27.16b
   5854 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   5855 	aese	v0.16b, v27.16b
   5856 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   5857 
   5858 	aese	v4.16b, v27.16b
   5859 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   5860 	aese	v3.16b, v27.16b
   5861 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   5862 	aese	v5.16b, v27.16b
   5863 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   5864 
   5865 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   5866 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   5867 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   5868 
   5869 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   5870 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   5871 	aese	v7.16b, v27.16b
   5872 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   5873 
   5874 	aese	v1.16b, v28.16b
   5875 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   5876 	aese	v5.16b, v28.16b
   5877 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   5878 	aese	v6.16b, v28.16b
   5879 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   5880 
   5881 	aese	v2.16b, v28.16b
   5882 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   5883 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   5884 	aese	v4.16b, v28.16b
   5885 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   5886 
   5887 	aese	v5.16b, v26.16b
   5888 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   5889 	aese	v6.16b, v26.16b
   5890 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   5891 	aese	v0.16b, v28.16b
   5892 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   5893 
   5894 	aese	v1.16b, v26.16b
   5895 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   5896 	aese	v7.16b, v28.16b
   5897 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   5898 	aese	v3.16b, v28.16b
   5899 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   5900 
   5901 	aese	v4.16b, v26.16b
   5902 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   5903 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   5904 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   5905 
   5906 	aese	v3.16b, v26.16b
   5907 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   5908 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   5909 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   5910 
   5911 	aese	v2.16b, v26.16b
   5912 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   5913 	aese	v7.16b, v26.16b
   5914 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   5915 	aese	v0.16b, v26.16b
   5916 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   5917 
   5918 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   5919 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   5920 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   5921 
   5922 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   5923 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   5924 	ldr	q23, [x3, #80]				//load h3l | h3h
   5925 	ext	v23.16b, v23.16b, v23.16b, #8
   5926 	ldr	q25, [x3, #112]				//load h4l | h4h
   5927 	ext	v25.16b, v25.16b, v25.16b, #8
   5928 
   5929 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   5930 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   5931 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   5932 
   5933 	aese	v4.16b, v27.16b
   5934 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   5935 	aese	v1.16b, v27.16b
   5936 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   5937 	aese	v5.16b, v27.16b
   5938 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   5939 
   5940 	aese	v7.16b, v27.16b
   5941 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   5942 	aese	v3.16b, v27.16b
   5943 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   5944 	aese	v2.16b, v27.16b
   5945 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   5946 
   5947 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   5948 	aese	v6.16b, v27.16b
   5949 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   5950 	aese	v0.16b, v27.16b
   5951 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   5952 
   5953 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   5954 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   5955 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   5956 
   5957 	aese	v5.16b, v28.16b
   5958 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   5959 	aese	v7.16b, v28.16b
   5960 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   5961 	aese	v4.16b, v28.16b
   5962 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   5963 
   5964 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   5965 	aese	v2.16b, v28.16b
   5966 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   5967 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   5968 
   5969 	aese	v3.16b, v28.16b
   5970 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   5971 	aese	v6.16b, v28.16b
   5972 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   5973 	aese	v1.16b, v28.16b
   5974 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   5975 
   5976 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   5977 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   5978 	aese	v0.16b, v28.16b
   5979 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   5980 
   5981 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   5982 	aese	v4.16b, v26.16b
   5983 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   5984 	aese	v2.16b, v26.16b
   5985 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   5986 
   5987 	aese	v6.16b, v26.16b
   5988 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   5989 	aese	v1.16b, v26.16b
   5990 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   5991 	aese	v7.16b, v26.16b
   5992 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   5993 
   5994 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   5995 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   5996 	aese	v5.16b, v26.16b
   5997 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   5998 
   5999 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   6000 	aese	v3.16b, v26.16b
   6001 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   6002 	aese	v0.16b, v26.16b
   6003 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   6004 
   6005 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   6006 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   6007 	aese	v5.16b, v27.16b
   6008 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   6009 
   6010 	ldr	q20, [x3, #32]				//load h1l | h1h
   6011 	ext	v20.16b, v20.16b, v20.16b, #8
   6012 	ldr	q22, [x3, #64]				//load h2l | h2h
   6013 	ext	v22.16b, v22.16b, v22.16b, #8
   6014 	aese	v2.16b, v27.16b
   6015 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   6016 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   6017 
   6018 	ldr	q21, [x3, #48]				//load h2k | h1k
   6019 	ldr	q24, [x3, #96]				//load h4k | h3k
   6020 	aese	v6.16b, v27.16b
   6021 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   6022 	aese	v3.16b, v27.16b
   6023 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   6024 
   6025 	aese	v0.16b, v27.16b
   6026 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   6027 	aese	v7.16b, v27.16b
   6028 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   6029 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   6030 
   6031 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   6032 	aese	v4.16b, v27.16b
   6033 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   6034 	aese	v1.16b, v27.16b
   6035 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   6036 
   6037 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   6038 	aese	v7.16b, v28.16b
   6039 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   6040 	aese	v0.16b, v28.16b
   6041 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   6042 
   6043 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   6044 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   6045 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   6046 
   6047 	aese	v3.16b, v28.16b
   6048 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   6049 	aese	v0.16b, v26.16b
   6050 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   6051 	aese	v1.16b, v28.16b
   6052 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   6053 
   6054 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   6055 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   6056 	aese	v2.16b, v28.16b
   6057 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   6058 
   6059 	aese	v5.16b, v28.16b
   6060 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   6061 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   6062 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   6063 
   6064 	aese	v6.16b, v28.16b
   6065 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   6066 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   6067 	aese	v4.16b, v28.16b
   6068 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   6069 
   6070 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   6071 	aese	v7.16b, v26.16b
   6072 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   6073 	aese	v5.16b, v26.16b
   6074 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   6075 
   6076 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   6077 	aese	v6.16b, v26.16b
   6078 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   6079 	aese	v4.16b, v26.16b
   6080 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   6081 
   6082 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   6083 	aese	v2.16b, v26.16b
   6084 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   6085 	aese	v3.16b, v26.16b
   6086 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   6087 
   6088 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   6089 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   6090 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   6091 
   6092 	ldr	d16, [x10]			//MODULO - load modulo constant
   6093 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   6094 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   6095 
   6096 	aese	v1.16b, v26.16b
   6097 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   6098 
   6099 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   6100 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   6101 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   6102 
   6103 	aese	v4.16b, v27.16b
   6104 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   6105 	aese	v3.16b, v27.16b
   6106 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   6107 	aese	v5.16b, v27.16b
   6108 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   6109 
   6110 	aese	v0.16b, v27.16b
   6111 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   6112 	aese	v2.16b, v27.16b
   6113 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   6114 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   6115 
   6116 	aese	v1.16b, v27.16b
   6117 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   6118 	aese	v7.16b, v27.16b
   6119 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   6120 	aese	v6.16b, v27.16b
   6121 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   6122 
   6123 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   6124 
   6125 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   6126 	rev32	v20.16b, v30.16b					//CTR block 8k+16
   6127 
   6128 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   6129 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load plaintext
   6130 	aese	v2.16b, v28.16b
   6131 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
   6132 
   6133 	aese	v6.16b, v28.16b
   6134 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
   6135 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
   6136 	aese	v3.16b, v28.16b
   6137 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
   6138 
   6139 	aese	v0.16b, v28.16b
   6140 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
   6141 	aese	v7.16b, v28.16b
   6142 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
   6143 
   6144 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   6145 	aese	v1.16b, v28.16b
   6146 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
   6147 
   6148 	aese	v7.16b, v26.16b
   6149 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
   6150 	aese	v5.16b, v28.16b
   6151 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
   6152 
   6153 	aese	v3.16b, v26.16b
   6154 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
   6155 	aese	v6.16b, v26.16b
   6156 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
   6157 	rev32	v22.16b, v30.16b					//CTR block 8k+17
   6158 
   6159 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
   6160 	aese	v4.16b, v28.16b
   6161 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
   6162 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   6163 
   6164 	aese	v5.16b, v26.16b
   6165 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
   6166 	ldr	q28, [x8, #224]					//load rk14
   6167 	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
   6168 
   6169 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load plaintext
   6170 	aese	v2.16b, v26.16b
   6171 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
   6172 	aese	v4.16b, v26.16b
   6173 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
   6174 
   6175 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   6176 	aese	v1.16b, v26.16b
   6177 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
   6178 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load plaintext
   6179 
   6180 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load plaintext
   6181 	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
   6182 	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
   6183 
   6184 	rev32	v23.16b, v30.16b					//CTR block 8k+18
   6185 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
   6186 	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
   6187 
   6188 	aese	v0.16b, v26.16b
   6189 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
   6190 	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
   6191 	cmp	x0, x5				//.LOOP CONTROL
   6192 
   6193 .inst	0xce02714a	//eor3 v10.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
   6194 	rev32	v25.16b, v30.16b					//CTR block 8k+19
   6195 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
   6196 
   6197 	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
   6198 	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
   6199 .inst	0xce0571ad	//eor3 v13.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
   6200 
   6201 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   6202 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   6203 	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
   6204 
   6205 .inst	0xce04718c	//eor3 v12.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
   6206 	rev32	v4.16b, v30.16b				//CTR block 8k+20
   6207 .inst	0xce03716b	//eor3 v11.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result
   6208 
   6209 	mov	v3.16b, v25.16b					//CTR block 8k+19
   6210 .inst	0xce017129	//eor3 v9.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result
   6211 .inst	0xce007108	//eor3 v8.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result
   6212 
   6213 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
   6214 	stp	q8, q9, [x2], #32			//AES block 8k+8, 8k+9 - store result
   6215 	mov	v2.16b, v23.16b					//CTR block 8k+18
   6216 
   6217 .inst	0xce0771ef	//eor3 v15.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
   6218 .inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
   6219 	stp	q10, q11, [x2], #32			//AES block 8k+10, 8k+11 - store result
   6220 
   6221 .inst	0xce0671ce	//eor3 v14.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
   6222 	mov	v1.16b, v22.16b					//CTR block 8k+17
   6223 	stp	q12, q13, [x2], #32			//AES block 4, 5 - store result
   6224 
   6225 	stp	q14, q15, [x2], #32			//AES block 6, 7 - store result
   6226 	mov	v0.16b, v20.16b					//CTR block 8k+16
   6227 	b.lt	.L256_enc_main_loop
   6228 
   6229 .L256_enc_prepretail:	//PREPRETAIL
        	// Last full 8-block pass before the tail; falls through to .L256_enc_tail.
        	//  CTR:    derives counter blocks 8k+13..8k+15 into v5-v7 from v30 (v31 is
        	//          the per-block increment); v0-v4 are used as already prepared
        	//          counter blocks.
        	//  AES:    runs rounds 0-13 on v0-v7 with round keys streamed from [x8]
        	//          through v26-v28. Round 13 is aese only (no aesmc); rk14 is
        	//          loaded into v28 but not applied here.
        	//  GHASH:  byte-reverses v8-v15 (rev64), folds the running value held in
        	//          v19 into v8, multiplies by the hash-key values loaded from
        	//          [x3, #32..#208] and accumulates the high/mid/low partial
        	//          products in v17/v18/v19.
        	//  MODULO: reduces v17/v18/v19 using the constant loaded from [x10];
        	//          the reduced value ends up in v19.
        	// No plaintext is loaded from x0 and nothing is stored through x2 here.
   6230 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   6231 	ldp	q26, q27, [x8, #0]					//load rk0, rk1
   6232 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   6233 
   6234 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   6235 
   6236 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   6237 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   6238 
   6239 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   6240 	ldr	q21, [x3, #144]				//load h6k | h5k
   6241 	ldr	q24, [x3, #192]				//load h8k | h7k
   6242 
   6243 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   6244 
   6245 	aese	v6.16b, v26.16b
   6246 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   6247 	aese	v4.16b, v26.16b
   6248 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   6249 	aese	v1.16b, v26.16b
   6250 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   6251 
   6252 	aese	v5.16b, v26.16b
   6253 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   6254 	aese	v0.16b, v26.16b
   6255 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   6256 
   6257 	aese	v2.16b, v26.16b
   6258 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   6259 	aese	v7.16b, v26.16b
   6260 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   6261 	aese	v3.16b, v26.16b
   6262 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   6263 
   6264 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0 - swap the 64-bit halves of the running GHASH value
   6265 	rev64	v8.16b, v8.16b						//GHASH block 8k
   6266 	aese	v1.16b, v27.16b
   6267 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   6268 
   6269 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   6270 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   6271 	aese	v3.16b, v27.16b
   6272 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   6273 
   6274 	ldr	q23, [x3, #176]				//load h7l | h7h
   6275 	ext	v23.16b, v23.16b, v23.16b, #8
   6276 	ldr	q25, [x3, #208]				//load h8l | h8h
   6277 	ext	v25.16b, v25.16b, v25.16b, #8
   6278 	aese	v2.16b, v27.16b
   6279 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   6280 
   6281 	ldr	q20, [x3, #128]				//load h5l | h5h
   6282 	ext	v20.16b, v20.16b, v20.16b, #8
   6283 	ldr	q22, [x3, #160]				//load h6l | h6h
   6284 	ext	v22.16b, v22.16b, v22.16b, #8
   6285 	aese	v0.16b, v27.16b
   6286 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   6287 	aese	v5.16b, v27.16b
   6288 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   6289 
   6290 	aese	v4.16b, v27.16b
   6291 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   6292 	eor	v8.16b, v8.16b, v19.16b					//PRE 1 - fold the running GHASH value into block 8k
   6293 
   6294 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   6295 	aese	v6.16b, v27.16b
   6296 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   6297 
   6298 	aese	v1.16b, v28.16b
   6299 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   6300 	aese	v2.16b, v28.16b
   6301 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   6302 	aese	v7.16b, v27.16b
   6303 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   6304 
   6305 	aese	v4.16b, v28.16b
   6306 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   6307 	aese	v0.16b, v28.16b
   6308 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   6309 	aese	v6.16b, v28.16b
   6310 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   6311 
   6312 	aese	v5.16b, v28.16b
   6313 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   6314 	aese	v7.16b, v28.16b
   6315 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   6316 	aese	v3.16b, v28.16b
   6317 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   6318 
   6319 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   6320 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   6321 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   6322 
   6323 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   6324 	aese	v4.16b, v26.16b
   6325 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   6326 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   6327 
   6328 	aese	v7.16b, v26.16b
   6329 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   6330 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   6331 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   6332 
   6333 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   6334 	aese	v6.16b, v26.16b
   6335 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   6336 
   6337 	aese	v2.16b, v26.16b
   6338 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   6339 	aese	v3.16b, v26.16b
   6340 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   6341 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   6342 
   6343 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   6344 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   6345 	aese	v1.16b, v26.16b
   6346 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   6347 
   6348 	aese	v0.16b, v26.16b
   6349 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   6350 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   6351 	aese	v5.16b, v26.16b
   6352 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   6353 
   6354 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   6355 	aese	v1.16b, v27.16b
   6356 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   6357 	aese	v6.16b, v27.16b
   6358 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   6359 
   6360 	aese	v0.16b, v27.16b
   6361 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   6362 	aese	v2.16b, v27.16b
   6363 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   6364 	aese	v4.16b, v27.16b
   6365 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   6366 
   6367 	aese	v6.16b, v28.16b
   6368 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   6369 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   6370 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   6371 
   6372 	aese	v7.16b, v27.16b
   6373 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   6374 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   6375 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   6376 
   6377 	aese	v5.16b, v27.16b
   6378 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   6379 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   6380 	aese	v3.16b, v27.16b
   6381 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   6382 
   6383 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   6384 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   6385 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   6386 
   6387 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   6388 	aese	v1.16b, v28.16b
   6389 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   6390 	aese	v0.16b, v28.16b
   6391 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   6392 
   6393 	aese	v7.16b, v28.16b
   6394 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   6395 	aese	v4.16b, v28.16b
   6396 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   6397 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   6398 
   6399 	ldr	q23, [x3, #80]				//load h3l | h3h
   6400 	ext	v23.16b, v23.16b, v23.16b, #8
   6401 	ldr	q25, [x3, #112]				//load h4l | h4h
   6402 	ext	v25.16b, v25.16b, v25.16b, #8
   6403 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   6404 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   6405 
   6406 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   6407 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   6408 
   6409 	aese	v5.16b, v28.16b
   6410 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   6411 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   6412 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   6413 
   6414 	aese	v3.16b, v28.16b
   6415 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   6416 	aese	v2.16b, v28.16b
   6417 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   6418 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   6419 
   6420 	aese	v7.16b, v26.16b
   6421 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   6422 	aese	v4.16b, v26.16b
   6423 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   6424 	aese	v6.16b, v26.16b
   6425 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   6426 
   6427 	ldr	q21, [x3, #48]				//load h2k | h1k
   6428 	ldr	q24, [x3, #96]				//load h4k | h3k
   6429 	aese	v5.16b, v26.16b
   6430 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   6431 	aese	v3.16b, v26.16b
   6432 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   6433 
   6434 	aese	v0.16b, v26.16b
   6435 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   6436 	aese	v1.16b, v26.16b
   6437 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   6438 	aese	v2.16b, v26.16b
   6439 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   6440 
   6441 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   6442 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   6443 	ldr	q20, [x3, #32]				//load h1l | h1h
   6444 	ext	v20.16b, v20.16b, v20.16b, #8
   6445 	ldr	q22, [x3, #64]				//load h2l | h2h
   6446 	ext	v22.16b, v22.16b, v22.16b, #8
   6447 
   6448 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   6449 	aese	v1.16b, v27.16b
   6450 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   6451 	aese	v4.16b, v27.16b
   6452 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   6453 
   6454 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   6455 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   6456 
   6457 	aese	v5.16b, v27.16b
   6458 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   6459 	aese	v6.16b, v27.16b
   6460 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   6461 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   6462 
   6463 	aese	v7.16b, v27.16b
   6464 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   6465 	aese	v3.16b, v27.16b
   6466 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   6467 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   6468 
   6469 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   6470 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   6471 	aese	v2.16b, v27.16b
   6472 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   6473 
   6474 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   6475 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   6476 	aese	v0.16b, v27.16b
   6477 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   6478 
   6479 	aese	v7.16b, v28.16b
   6480 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   6481 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   6482 	aese	v2.16b, v28.16b
   6483 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   6484 
   6485 	aese	v6.16b, v28.16b
   6486 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   6487 	aese	v4.16b, v28.16b
   6488 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   6489 	aese	v3.16b, v28.16b
   6490 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   6491 
   6492 	aese	v5.16b, v28.16b
   6493 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   6494 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   6495 	aese	v0.16b, v28.16b
   6496 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   6497 
   6498 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   6499 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   6500 	aese	v1.16b, v28.16b
   6501 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   6502 
   6503 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   6504 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   6505 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   6506 
   6507 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   6508 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   6509 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   6510 
   6511 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   6512 	aese	v1.16b, v26.16b
   6513 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   6514 	aese	v0.16b, v26.16b
   6515 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   6516 
   6517 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   6518 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   6519 	ldr	d16, [x10]			//MODULO - load modulo constant
   6520 
   6521 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   6522 
   6523 	aese	v3.16b, v26.16b
   6524 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   6525 	aese	v7.16b, v26.16b
   6526 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   6527 	aese	v5.16b, v26.16b
   6528 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   6529 
   6530 	aese	v2.16b, v26.16b
   6531 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   6532 	aese	v6.16b, v26.16b
   6533 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   6534 
   6535 	aese	v5.16b, v27.16b
   6536 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   6537 	aese	v1.16b, v27.16b
   6538 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   6539 	aese	v4.16b, v26.16b
   6540 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   6541 
   6542 	aese	v7.16b, v27.16b
   6543 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   6544 	aese	v6.16b, v27.16b
   6545 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   6546 	aese	v3.16b, v27.16b
   6547 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   6548 
   6549 	aese	v4.16b, v27.16b
   6550 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   6551 	aese	v0.16b, v27.16b
   6552 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   6553 	aese	v2.16b, v27.16b
   6554 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   6555 
   6556 	pmull	v29.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   6557 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   6558 	aese	v7.16b, v28.16b
   6559 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
   6560 
   6561 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   6562 	ext	v21.16b, v17.16b, v17.16b, #8			 	//MODULO - other top alignment
   6563 	aese	v2.16b, v28.16b
   6564 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
   6565 
   6566 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   6567 	aese	v1.16b, v28.16b
   6568 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
   6569 	aese	v6.16b, v28.16b
   6570 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
   6571 
   6572 	aese	v0.16b, v28.16b
   6573 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
   6574 	aese	v4.16b, v28.16b
   6575 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
   6576 	aese	v5.16b, v28.16b
   6577 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
   6578 
   6579 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   6580 	aese	v3.16b, v28.16b
   6581 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
   6582 	ldr	q28, [x8, #224]					//load rk14 (not applied here; still in v28 on fall-through to the tail)
   6583 
   6584 	aese	v1.16b, v26.16b
   6585 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
   6586 	aese	v2.16b, v26.16b
   6587 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
   6588 	aese	v0.16b, v26.16b
   6589 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
   6590 
   6591 	aese	v6.16b, v26.16b
   6592 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
   6593 	aese	v5.16b, v26.16b
   6594 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
   6595 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   6596 
   6597 	aese	v4.16b, v26.16b
   6598 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
   6599 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   6600 
   6601 	aese	v3.16b, v26.16b
   6602 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
   6603 	aese	v7.16b, v26.16b
   6604 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
   6605 	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
   6606 
   6607 .inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
   6608 	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
   6609 	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
   6610 
   6611 	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
   6612 	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
   6613 	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
   6614 
   6615 	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
   6616 	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
   6617 .L256_enc_tail:	//TAIL
        	// Dispatcher for the final blocks.
        	//  * Reloads the hash-key values for the top powers from [x3, #128..#208].
        	//  * Sets x5 = x4 - x0 and compares it against 112, 96, ..., 16 to pick
        	//    one of the .L256_enc_blocks_more_than_N entry points (or
        	//    .L256_enc_blocks_less_than_1).
        	//  * Computes the first result block in v9 = v8 ^ v0 ^ v29.
        	//  * The block paths read their keystream from fixed registers (v1 in
        	//    more_than_7, v2 in more_than_6, ... v6 in more_than_2), so for every
        	//    threshold that is not exceeded the keystream registers are shifted
        	//    one step towards v7 and the counter in v30 is rolled back by one
        	//    increment (v31).
        	//  * v16 carries the running GHASH value (v19 with its 64-bit halves
        	//    swapped) into the first block path that executes; v17/v18/v19 are
        	//    zeroed unless more_than_7 is taken, which writes them itself.
   6618 
   6619 	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
   6620 	ext	v25.16b, v25.16b, v25.16b, #8
   6621 	sub	x5, x4, x0		//x5 (main_end_input_ptr) is reused as the number of bytes left to process (x4 - x0)
   6622 
   6623 	ldr	q8, [x0], #16				//AES block 8k+8 - load plaintext
   6624 
   6625 	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
   6626 	ext	v20.16b, v20.16b, v20.16b, #8
   6627 
   6628 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
   6629 	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
   6630 	ext	v22.16b, v22.16b, v22.16b, #8
   6631 	ext	v23.16b, v23.16b, v23.16b, #8
   6632 	mov	v29.16b, v28.16b				//keep a copy of v28 (rk14 on fall-through from the prepretail) in v29; the block paths overwrite v28
   6633 
   6634 	cmp	x5, #112					//more than 7 * 16 bytes left?
   6635 .inst	0xce007509	//eor3 v9.16b, v8.16b, v0.16b, v29.16b				//AES block 8k+8 - result
   6636 	b.gt	.L256_enc_blocks_more_than_7
   6637 
   6638 	movi	v19.8b, #0					//GHASH low accumulator starts at zero
   6639 	mov	v7.16b, v6.16b
   6640 	movi	v17.8b, #0					//GHASH high accumulator starts at zero
   6641 
   6642 	mov	v6.16b, v5.16b
   6643 	mov	v5.16b, v4.16b
   6644 	mov	v4.16b, v3.16b
   6645 
   6646 	mov	v3.16b, v2.16b
   6647 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6648 	mov	v2.16b, v1.16b
   6649 
   6650 	movi	v18.8b, #0					//GHASH mid accumulator starts at zero
   6651 	cmp	x5, #96						//more than 6 * 16 bytes left?
   6652 	b.gt	.L256_enc_blocks_more_than_6
   6653 
   6654 	mov	v7.16b, v6.16b
   6655 	mov	v6.16b, v5.16b
   6656 	cmp	x5, #80						//more than 5 * 16 bytes left?
   6657 
   6658 	mov	v5.16b, v4.16b
   6659 	mov	v4.16b, v3.16b
   6660 	mov	v3.16b, v1.16b
   6661 
   6662 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6663 	b.gt	.L256_enc_blocks_more_than_5
   6664 
   6665 	mov	v7.16b, v6.16b
   6666 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6667 
   6668 	mov	v6.16b, v5.16b
   6669 	mov	v5.16b, v4.16b
   6670 
   6671 	cmp	x5, #64						//more than 4 * 16 bytes left?
   6672 	mov	v4.16b, v1.16b
   6673 	b.gt	.L256_enc_blocks_more_than_4
   6674 
   6675 	cmp	x5, #48						//more than 3 * 16 bytes left?
   6676 	mov	v7.16b, v6.16b
   6677 	mov	v6.16b, v5.16b
   6678 
   6679 	mov	v5.16b, v1.16b
   6680 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6681 	b.gt	.L256_enc_blocks_more_than_3
   6682 
   6683 	cmp	x5, #32						//more than 2 * 16 bytes left?
   6684 	mov	v7.16b, v6.16b
   6685 	ldr	q24, [x3, #96]				//load h4k | h3k (more_than_2 uses v24, but only more_than_3 reloads it)
   6686 
   6687 	mov	v6.16b, v1.16b
   6688 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6689 	b.gt	.L256_enc_blocks_more_than_2
   6690 
   6691 	mov	v7.16b, v1.16b
   6692 
   6693 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6694 	cmp	x5, #16						//more than 16 bytes left?
   6695 	b.gt	.L256_enc_blocks_more_than_1
   6696 
   6697 	sub	v30.4s, v30.4s, v31.4s				//roll the counter back by one block
   6698 	ldr	q21, [x3, #48]				//load h2k | h1k
   6699 	b	.L256_enc_blocks_less_than_1
   6700 .L256_enc_blocks_more_than_7:	//blocks	left >  7
        	// Reached only by the branch in the tail, with v9 = v8 ^ v0 ^ v29.
        	// Stores v9 through x2, then starts the GHASH accumulators from this
        	// block: v17 (high), v18 (mid) and v19 (low) are written by plain pmull,
        	// not accumulated. Multipliers are v25 and the upper half of v24.
        	// Also loads the next plaintext block and forms its result in v9 using
        	// keystream v1. Falls through to .L256_enc_blocks_more_than_6.
   6701 	st1	{ v9.16b}, [x2], #16				//AES final-7 block  - store result
   6702 
   6703 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   6704 
   6705 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6706 
   6707 	ldr	q9, [x0], #16				//AES final-6 block - load plaintext
   6708 
   6709 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
   6710 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
   6711 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
   6712 
   6713 	movi	v16.8b, #0						//suppress further partial tag feed in
   6714 
   6715 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
   6716 .inst	0xce017529	//eor3 v9.16b, v9.16b, v1.16b, v29.16b			//AES final-6 block - result
   6717 
   6718 	pmull	v18.1q, v27.1d, v18.1d				//GHASH final-7 block - mid
   6719 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
   6720 .L256_enc_blocks_more_than_6:	//blocks	left >  6
        	// Entered by fall-through from more_than_7 or by the branch in the tail;
        	// either way v9 holds the result block to store. GHASHes that block with
        	// v23 (high/low) and the lower half of v24 (mid), XOR-accumulating into
        	// v17/v18/v19. v16 is the running GHASH value on the first block path
        	// executed and zero afterwards. v26-v28 are scratch here. Forms the next
        	// result in v9 using keystream v2 and falls through to
        	// .L256_enc_blocks_more_than_5.
   6721 
   6722 	st1	{ v9.16b}, [x2], #16				//AES final-6 block - store result
   6723 
   6724 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   6725 
   6726 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6727 
   6728 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
   6729 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
   6730 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   6731 
   6732 	ldr	q9, [x0], #16				//AES final-5 block - load plaintext
   6733 
   6734 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
   6735 
   6736 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
   6737 
   6738 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
   6739 .inst	0xce027529	//eor3 v9.16b, v9.16b, v2.16b, v29.16b			//AES final-5 block - result
   6740 
   6741 	movi	v16.8b, #0						//suppress further partial tag feed in
   6742 
   6743 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
   6744 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
   6745 .L256_enc_blocks_more_than_5:	//blocks	left >  5
        	// Entered by fall-through or by the branch in the tail, with v9 holding
        	// the result block to store. GHASHes that block with v22 (high/low) and
        	// the upper half of v21 (mid), XOR-accumulating into v17/v18/v19.
        	// The mid operand is copied into v27.d[1] so that pmull2 can pair it with
        	// the upper half of v21. Forms the next result in v9 using keystream v3
        	// and falls through to .L256_enc_blocks_more_than_4.
   6746 
   6747 	st1	{ v9.16b}, [x2], #16				//AES final-5 block - store result
   6748 
   6749 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   6750 
   6751 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6752 
   6753 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
   6754 
   6755 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   6756 
   6757 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
   6758 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
   6759 
   6760 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
   6761 
   6762 	ldr	q9, [x0], #16				//AES final-4 block - load plaintext
   6763 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   6764 
   6765 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
   6766 	movi	v16.8b, #0						//suppress further partial tag feed in
   6767 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
   6768 
   6769 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
   6770 .inst	0xce037529	//eor3 v9.16b, v9.16b, v3.16b, v29.16b			//AES final-4 block - result
   6771 .L256_enc_blocks_more_than_4:	//blocks	left >  4
        	// Entered by fall-through or by the branch in the tail, with v9 holding
        	// the result block to store. GHASHes that block with v20 (high/low) and
        	// the lower half of v21 (mid), XOR-accumulating into v17/v18/v19.
        	// Forms the next result in v9 using keystream v4 and falls through to
        	// .L256_enc_blocks_more_than_3.
   6772 
   6773 	st1	{ v9.16b}, [x2], #16				//AES final-4 block - store result
   6774 
   6775 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   6776 
   6777 	ldr	q9, [x0], #16				//AES final-3 block - load plaintext
   6778 
   6779 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6780 
   6781 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
   6782 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   6783 
   6784 .inst	0xce047529	//eor3 v9.16b, v9.16b, v4.16b, v29.16b			//AES final-3 block - result
   6785 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   6786 
   6787 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
   6788 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
   6789 
   6790 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
   6791 
   6792 	movi	v16.8b, #0						//suppress further partial tag feed in
   6793 
   6794 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
   6795 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
   6796 .L256_enc_blocks_more_than_3:	//blocks	left >  3
        	// Entered by fall-through or by the branch in the tail, with v9 holding
        	// the result block to store. Reloads v25 from [x3, #112] and v24 from
        	// [x3, #96], replacing the values the tail loaded into those registers.
        	// GHASHes the block with v25 (high/low) and the upper half of v24 (mid,
        	// via pmull2 on the duplicated mid operand), XOR-accumulating into
        	// v17/v18/v19. Forms the next result in v9 using keystream v5 and falls
        	// through to .L256_enc_blocks_more_than_2.
   6797 
   6798 	st1	{ v9.16b}, [x2], #16				//AES final-3 block - store result
   6799 
   6800 	ldr	q25, [x3, #112]				//load h4l | h4h
   6801 	ext	v25.16b, v25.16b, v25.16b, #8
   6802 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   6803 
   6804 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6805 
   6806 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
   6807 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   6808 
   6809 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
   6810 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
   6811 	ldr	q24, [x3, #96]				//load h4k | h3k
   6812 
   6813 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
   6814 	ldr	q9, [x0], #16				//AES final-2 block - load plaintext
   6815 
   6816 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
   6817 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   6818 
   6819 .inst	0xce057529	//eor3 v9.16b, v9.16b, v5.16b, v29.16b			//AES final-2 block - result
   6820 	movi	v16.8b, #0						//suppress further partial tag feed in
   6821 
   6822 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
   6823 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
   6824 .L256_enc_blocks_more_than_2:	//blocks	left >  2
   6825 
   6826 	ldr	q23, [x3, #80]				//load h3l | h3h
   6827 	ext	v23.16b, v23.16b, v23.16b, #8
   6828 
   6829 	st1	{ v9.16b}, [x2], #16			 	//AES final-2 block - store result
   6830 
   6831 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   6832 	ldr	q9, [x0], #16				//AES final-1 block - load plaintext
   6833 
   6834 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6835 
   6836 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
   6837 
   6838 	movi	v16.8b, #0						//suppress further partial tag feed in
   6839 
   6840 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   6841 .inst	0xce067529	//eor3 v9.16b, v9.16b, v6.16b, v29.16b			//AES final-1 block - result
   6842 
   6843 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
   6844 
   6845 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
   6846 
   6847 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
   6848 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   6849 
   6850 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
   6851 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
   6852 .L256_enc_blocks_more_than_1:	//blocks	left >  1
   6853 
   6854 	st1	{ v9.16b}, [x2], #16				//AES final-1 block - store result
   6855 
   6856 	ldr	q22, [x3, #64]				//load h2l | h2h
   6857 	ext	v22.16b, v22.16b, v22.16b, #8
   6858 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   6859 	ldr	q9, [x0], #16				//AES final block - load plaintext
   6860 
   6861 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6862 	movi	v16.8b, #0						//suppress further partial tag feed in
   6863 
   6864 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
   6865 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   6866 
   6867 .inst	0xce077529	//eor3 v9.16b, v9.16b, v7.16b, v29.16b			//AES final block - result
   6868 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
   6869 
   6870 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   6871 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
   6872 
   6873 	ldr	q21, [x3, #48]				//load h2k | h1k
   6874 
   6875 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
   6876 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
   6877 
   6878 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
   6879 
   6880 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
   6881 .L256_enc_blocks_less_than_1:	//blocks	left <= 1
   6882 
   6883 	and	x1, x1, #127				//bit_length %= 128
   6884 
   6885 	sub	x1, x1, #128				//bit_length -= 128
   6886 
   6887 	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
   6888 
   6889 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   6890 	and	x1, x1, #127				//bit_length %= 128
   6891 
   6892 	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
   6893 	cmp	x1, #64
   6894 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   6895 
   6896 	csel	x14, x6, xzr, lt
   6897 	csel	x13, x7, x6, lt
   6898 
   6899 	mov	v0.d[0], x13					//ctr0b is mask for last block
   6900 	ldr	q20, [x3, #32]				//load h1l | h1h
   6901 	ext	v20.16b, v20.16b, v20.16b, #8
   6902 
   6903 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   6904 	mov	v0.d[1], x14
   6905 
   6906 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   6907 
   6908 	rev64	v8.16b, v9.16b						//GHASH final block
   6909 
   6910 	rev32	v30.16b, v30.16b
   6911 	bif	v9.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
   6912 	str	q30, [x16]					//store the updated counter
   6913 
   6914 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   6915 	st1	{ v9.16b}, [x2]				//store all 16B
   6916 
   6917 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
   6918 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   6919 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   6920 
   6921 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
   6922 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
   6923 
   6924 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
   6925 
   6926 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
   6927 
   6928 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
   6929 	ldr	d16, [x10]			//MODULO - load modulo constant
   6930 
   6931 	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   6932 
   6933 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   6934 	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
   6935 
   6936 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   6937 
   6938 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   6939 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   6940 
   6941 .inst	0xce115673	//eor3 v19.16b, v19.16b, v17.16b, v21.16b		 	//MODULO - fold into low
   6942 	ext	v19.16b, v19.16b, v19.16b, #8
   6943 	rev64	v19.16b, v19.16b
   6944 	st1	{ v19.16b }, [x3]
   6945 	mov	x0, x9					//return sizes
   6946 
   6947 	ldp	d10, d11, [sp, #16]
   6948 	ldp	d12, d13, [sp, #32]
   6949 	ldp	d14, d15, [sp, #48]
   6950 	ldp	d8, d9, [sp], #80
   6951 	ret
   6952 
   6953 .L256_enc_ret:
   6954 	mov	w0, #0x0
   6955 	ret
   6956 .size	unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
   6957 .globl	unroll8_eor3_aes_gcm_dec_256_kernel
   6958 .type	unroll8_eor3_aes_gcm_dec_256_kernel,%function
   6959 .align	4
   6960 unroll8_eor3_aes_gcm_dec_256_kernel:
   6961 	AARCH64_VALID_CALL_TARGET
   6962 	cbz	x1, .L256_dec_ret
   6963 	stp	d8, d9, [sp, #-80]!
   6964 	lsr	x9, x1, #3
   6965 	mov	x16, x4
   6966 	mov	x8, x5
   6967 	stp	d10, d11, [sp, #16]
   6968 	stp	d12, d13, [sp, #32]
   6969 	stp	d14, d15, [sp, #48]
   6970 	mov	x5, #0xc200000000000000
   6971 	stp	x5, xzr, [sp, #64]
   6972 	add	x10, sp, #64
   6973 
   6974 	ld1	{ v0.16b}, [x16]					//CTR block 0
   6975 
   6976 	mov	x15, #0x100000000			//set up counter increment
   6977 	movi	v31.16b, #0x0
   6978 	mov	v31.d[1], x15
   6979 	mov	x5, x9
   6980 
   6981 	sub	x5, x5, #1		//byte_len - 1
   6982 
   6983 	rev32	v30.16b, v0.16b				//set up reversed counter
   6984 
   6985 	add	v30.4s, v30.4s, v31.4s		//CTR block 0
   6986 
   6987 	rev32	v1.16b, v30.16b				//CTR block 1
   6988 	add	v30.4s, v30.4s, v31.4s		//CTR block 1
   6989 
   6990 	rev32	v2.16b, v30.16b				//CTR block 2
   6991 	add	v30.4s, v30.4s, v31.4s		//CTR block 2
   6992 	ldp	q26, q27, [x8, #0]				  	//load rk0, rk1
   6993 
   6994 	rev32	v3.16b, v30.16b				//CTR block 3
   6995 	add	v30.4s, v30.4s, v31.4s		//CTR block 3
   6996 
   6997 	rev32	v4.16b, v30.16b				//CTR block 4
   6998 	add	v30.4s, v30.4s, v31.4s		//CTR block 4
   6999 
   7000 	aese	v0.16b, v26.16b
   7001 	aesmc	v0.16b, v0.16b			//AES block 0 - round 0
   7002 
   7003 	rev32	v5.16b, v30.16b				//CTR block 5
   7004 	add	v30.4s, v30.4s, v31.4s		//CTR block 5
   7005 
   7006 	aese	v1.16b, v26.16b
   7007 	aesmc	v1.16b, v1.16b			//AES block 1 - round 0
   7008 	aese	v2.16b, v26.16b
   7009 	aesmc	v2.16b, v2.16b			//AES block 2 - round 0
   7010 
   7011 	rev32	v6.16b, v30.16b				//CTR block 6
   7012 	add	v30.4s, v30.4s, v31.4s		//CTR block 6
   7013 
   7014 	rev32	v7.16b, v30.16b				//CTR block 7
   7015 	aese	v4.16b, v26.16b
   7016 	aesmc	v4.16b, v4.16b			//AES block 4 - round 0
   7017 
   7018 	aese	v6.16b, v26.16b
   7019 	aesmc	v6.16b, v6.16b		        //AES block 6 - round 0
   7020 	aese	v5.16b, v26.16b
   7021 	aesmc	v5.16b, v5.16b			//AES block 5 - round 0
   7022 
   7023 	aese	v3.16b, v26.16b
   7024 	aesmc	v3.16b, v3.16b			//AES block 3 - round 0
   7025 	aese	v7.16b, v26.16b
   7026 	aesmc	v7.16b, v7.16b		        //AES block 7 - round 0
   7027 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   7028 
   7029 	aese	v6.16b, v27.16b
   7030 	aesmc	v6.16b, v6.16b		        //AES block 6 - round 1
   7031 	aese	v4.16b, v27.16b
   7032 	aesmc	v4.16b, v4.16b		        //AES block 4 - round 1
   7033 	aese	v0.16b, v27.16b
   7034 	aesmc	v0.16b, v0.16b		        //AES block 0 - round 1
   7035 
   7036 	aese	v5.16b, v27.16b
   7037 	aesmc	v5.16b, v5.16b			//AES block 5 - round 1
   7038 	aese	v7.16b, v27.16b
   7039 	aesmc	v7.16b, v7.16b			//AES block 7 - round 1
   7040 	aese	v1.16b, v27.16b
   7041 	aesmc	v1.16b, v1.16b			//AES block 1 - round 1
   7042 
   7043 	aese	v2.16b, v27.16b
   7044 	aesmc	v2.16b, v2.16b			//AES block 2 - round 1
   7045 	aese	v3.16b, v27.16b
   7046 	aesmc	v3.16b, v3.16b			//AES block 3 - round 1
   7047 
   7048 	aese	v3.16b, v28.16b
   7049 	aesmc	v3.16b, v3.16b			//AES block 3 - round 2
   7050 	aese	v2.16b, v28.16b
   7051 	aesmc	v2.16b, v2.16b			//AES block 2 - round 2
   7052 	aese	v6.16b, v28.16b
   7053 	aesmc	v6.16b, v6.16b			//AES block 6 - round 2
   7054 
   7055 	aese	v1.16b, v28.16b
   7056 	aesmc	v1.16b, v1.16b			//AES block 1 - round 2
   7057 	aese	v7.16b, v28.16b
   7058 	aesmc	v7.16b, v7.16b			//AES block 7 - round 2
   7059 	aese	v5.16b, v28.16b
   7060 	aesmc	v5.16b, v5.16b			//AES block 5 - round 2
   7061 
   7062 	aese	v0.16b, v28.16b
   7063 	aesmc	v0.16b, v0.16b			//AES block 0 - round 2
   7064 	aese	v4.16b, v28.16b
   7065 	aesmc	v4.16b, v4.16b			//AES block 4 - round 2
   7066 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   7067 
   7068 	aese	v1.16b, v26.16b
   7069 	aesmc	v1.16b, v1.16b			//AES block 1 - round 3
   7070 	aese	v2.16b, v26.16b
   7071 	aesmc	v2.16b, v2.16b			//AES block 2 - round 3
   7072 
   7073 	aese	v3.16b, v26.16b
   7074 	aesmc	v3.16b, v3.16b			//AES block 3 - round 3
   7075 	aese	v4.16b, v26.16b
   7076 	aesmc	v4.16b, v4.16b			//AES block 4 - round 3
   7077 
   7078 	aese	v5.16b, v26.16b
   7079 	aesmc	v5.16b, v5.16b			//AES block 5 - round 3
   7080 	aese	v7.16b, v26.16b
   7081 	aesmc	v7.16b, v7.16b			//AES block 7 - round 3
   7082 	aese	v0.16b, v26.16b
   7083 	aesmc	v0.16b, v0.16b			//AES block 0 - round 3
   7084 
   7085 	aese	v6.16b, v26.16b
   7086 	aesmc	v6.16b, v6.16b			//AES block 6 - round 3
   7087 
   7088 	aese	v7.16b, v27.16b
   7089 	aesmc	v7.16b, v7.16b			//AES block 7 - round 4
   7090 	aese	v3.16b, v27.16b
   7091 	aesmc	v3.16b, v3.16b			//AES block 3 - round 4
   7092 
   7093 	aese	v6.16b, v27.16b
   7094 	aesmc	v6.16b, v6.16b			//AES block 6 - round 4
   7095 	aese	v2.16b, v27.16b
   7096 	aesmc	v2.16b, v2.16b			//AES block 2 - round 4
   7097 	aese	v0.16b, v27.16b
   7098 	aesmc	v0.16b, v0.16b			//AES block 0 - round 4
   7099 
   7100 	aese	v4.16b, v27.16b
   7101 	aesmc	v4.16b, v4.16b			//AES block 4 - round 4
   7102 	aese	v1.16b, v27.16b
   7103 	aesmc	v1.16b, v1.16b			//AES block 1 - round 4
   7104 	aese	v5.16b, v27.16b
   7105 	aesmc	v5.16b, v5.16b			//AES block 5 - round 4
   7106 
   7107 	aese	v0.16b, v28.16b
   7108 	aesmc	v0.16b, v0.16b			//AES block 0 - round 5
   7109 	aese	v6.16b, v28.16b
   7110 	aesmc	v6.16b, v6.16b			//AES block 6 - round 5
   7111 
   7112 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   7113 	aese	v4.16b, v28.16b
   7114 	aesmc	v4.16b, v4.16b			//AES block 4 - round 5
   7115 	aese	v7.16b, v28.16b
   7116 	aesmc	v7.16b, v7.16b			//AES block 7 - round 5
   7117 
   7118 	aese	v5.16b, v28.16b
   7119 	aesmc	v5.16b, v5.16b			//AES block 5 - round 5
   7120 
   7121 	aese	v2.16b, v28.16b
   7122 	aesmc	v2.16b, v2.16b			//AES block 2 - round 5
   7123 	aese	v3.16b, v28.16b
   7124 	aesmc	v3.16b, v3.16b			//AES block 3 - round 5
   7125 
   7126 	aese	v1.16b, v28.16b
   7127 	aesmc	v1.16b, v1.16b			//AES block 1 - round 5
   7128 
   7129 	aese	v4.16b, v26.16b
   7130 	aesmc	v4.16b, v4.16b			//AES block 4 - round 6
   7131 	aese	v3.16b, v26.16b
   7132 	aesmc	v3.16b, v3.16b			//AES block 3 - round 6
   7133 	aese	v7.16b, v26.16b
   7134 	aesmc	v7.16b, v7.16b			//AES block 7 - round 6
   7135 
   7136 	aese	v6.16b, v26.16b
   7137 	aesmc	v6.16b, v6.16b			//AES block 6 - round 6
   7138 	aese	v0.16b, v26.16b
   7139 	aesmc	v0.16b, v0.16b			//AES block 0 - round 6
   7140 	aese	v5.16b, v26.16b
   7141 	aesmc	v5.16b, v5.16b			//AES block 5 - round 6
   7142 
   7143 	aese	v2.16b, v26.16b
   7144 	aesmc	v2.16b, v2.16b			//AES block 2 - round 6
   7145 	aese	v1.16b, v26.16b
   7146 	aesmc	v1.16b, v1.16b			//AES block 1 - round 6
   7147 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   7148 
   7149 	aese	v5.16b, v27.16b
   7150 	aesmc	v5.16b, v5.16b			//AES block 5 - round 7
   7151 	aese	v0.16b, v27.16b
   7152 	aesmc	v0.16b, v0.16b			//AES block 0 - round 7
   7153 
   7154 	aese	v3.16b, v27.16b
   7155 	aesmc	v3.16b, v3.16b			//AES block 3 - round 7
   7156 	aese	v2.16b, v27.16b
   7157 	aesmc	v2.16b, v2.16b			//AES block 2 - round 7
   7158 	aese	v7.16b, v27.16b
   7159 	aesmc	v7.16b, v7.16b			//AES block 7 - round 7
   7160 
   7161 	aese	v4.16b, v27.16b
   7162 	aesmc	v4.16b, v4.16b			//AES block 4 - round 7
   7163 	aese	v1.16b, v27.16b
   7164 	aesmc	v1.16b, v1.16b			//AES block 1 - round 7
   7165 	aese	v6.16b, v27.16b
   7166 	aesmc	v6.16b, v6.16b			//AES block 6 - round 7
   7167 
   7168 	and	x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   7169 	aese	v7.16b, v28.16b
   7170 	aesmc	v7.16b, v7.16b			//AES block 7 - round 8
   7171 	aese	v5.16b, v28.16b
   7172 	aesmc	v5.16b, v5.16b			//AES block 5 - round 8
   7173 
   7174 	aese	v0.16b, v28.16b
   7175 	aesmc	v0.16b, v0.16b			//AES block 0 - round 8
   7176 	aese	v1.16b, v28.16b
   7177 	aesmc	v1.16b, v1.16b			//AES block 1 - round 8
   7178 	aese	v2.16b, v28.16b
   7179 	aesmc	v2.16b, v2.16b			//AES block 2 - round 8
   7180 
   7181 	aese	v4.16b, v28.16b
   7182 	aesmc	v4.16b, v4.16b			//AES block 4 - round 8
   7183 	aese	v3.16b, v28.16b
   7184 	aesmc	v3.16b, v3.16b			//AES block 3 - round 8
   7185 	aese	v6.16b, v28.16b
   7186 	aesmc	v6.16b, v6.16b			//AES block 6 - round 8
   7187 
   7188 	aese	v2.16b, v26.16b
   7189 	aesmc	v2.16b, v2.16b			//AES block 2 - round 9
   7190 
   7191 	ld1	{ v19.16b}, [x3]
   7192 	ext	v19.16b, v19.16b, v19.16b, #8
   7193 	rev64	v19.16b, v19.16b
   7194 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   7195 	add	x4, x0, x1, lsr #3 //end_input_ptr
   7196 	add	x5, x5, x0
   7197 
   7198 	aese	v3.16b, v26.16b
   7199 	aesmc	v3.16b, v3.16b			//AES block 3 - round 9
   7200 	aese	v6.16b, v26.16b
   7201 	aesmc	v6.16b, v6.16b			//AES block 6 - round 9
   7202 
   7203 	aese	v4.16b, v26.16b
   7204 	aesmc	v4.16b, v4.16b			//AES block 4 - round 9
   7205 	aese	v5.16b, v26.16b
   7206 	aesmc	v5.16b, v5.16b			//AES block 5 - round 9
   7207 
   7208 	aese	v7.16b, v26.16b
   7209 	aesmc	v7.16b, v7.16b			//AES block 7 - round 9
   7210 
   7211 	aese	v0.16b, v26.16b
   7212 	aesmc	v0.16b, v0.16b			//AES block 0 - round 9
   7213 	aese	v1.16b, v26.16b
   7214 	aesmc	v1.16b, v1.16b			//AES block 1 - round 9
   7215 
   7216 	aese	v4.16b, v27.16b
   7217 	aesmc	v4.16b, v4.16b			//AES block 4 - round 10
   7218 	aese	v7.16b, v27.16b
   7219 	aesmc	v7.16b, v7.16b			//AES block 7 - round 10
   7220 	aese	v5.16b, v27.16b
   7221 	aesmc	v5.16b, v5.16b			//AES block 5 - round 10
   7222 
   7223 	aese	v1.16b, v27.16b
   7224 	aesmc	v1.16b, v1.16b			//AES block 1 - round 10
   7225 	aese	v2.16b, v27.16b
   7226 	aesmc	v2.16b, v2.16b			//AES block 2 - round 10
   7227 	aese	v0.16b, v27.16b
   7228 	aesmc	v0.16b, v0.16b			//AES block 0 - round 10
   7229 
   7230 	aese	v6.16b, v27.16b
   7231 	aesmc	v6.16b, v6.16b			//AES block 6 - round 10
   7232 	aese	v3.16b, v27.16b
   7233 	aesmc	v3.16b, v3.16b			//AES block 3 - round 10
   7234 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   7235 
   7236 	aese	v0.16b, v28.16b
   7237 	aesmc	v0.16b, v0.16b			//AES block 0 - round 11
   7238 	add	v30.4s, v30.4s, v31.4s //CTR block 7
   7239 
   7240 	aese	v7.16b, v28.16b
   7241 	aesmc	v7.16b, v7.16b			//AES block 7 - round 11
   7242 	aese	v3.16b, v28.16b
   7243 	aesmc	v3.16b, v3.16b			//AES block 3 - round 11
   7244 	aese	v1.16b, v28.16b
   7245 	aesmc	v1.16b, v1.16b			//AES block 1 - round 11
   7246 
   7247 	aese	v5.16b, v28.16b
   7248 	aesmc	v5.16b, v5.16b			//AES block 5 - round 11
   7249 	aese	v4.16b, v28.16b
   7250 	aesmc	v4.16b, v4.16b			//AES block 4 - round 11
   7251 	aese	v2.16b, v28.16b
   7252 	aesmc	v2.16b, v2.16b			//AES block 2 - round 11
   7253 
   7254 	aese	v6.16b, v28.16b
   7255 	aesmc	v6.16b, v6.16b			//AES block 6 - round 11
   7256 	ldr	q28, [x8, #224]					//load rk14
   7257 
   7258 	aese	v1.16b, v26.16b
   7259 	aesmc	v1.16b, v1.16b			//AES block 1 - round 12
   7260 	aese	v4.16b, v26.16b
   7261 	aesmc	v4.16b, v4.16b			//AES block 4 - round 12
   7262 	aese	v5.16b, v26.16b
   7263 	aesmc	v5.16b, v5.16b			//AES block 5 - round 12
   7264 
   7265 	cmp	x0, x5				//check if we have <= 8 blocks
   7266 	aese	v3.16b, v26.16b
   7267 	aesmc	v3.16b, v3.16b			//AES block 3 - round 12
   7268 	aese	v2.16b, v26.16b
   7269 	aesmc	v2.16b, v2.16b			//AES block 2 - round 12
   7270 
   7271 	aese	v6.16b, v26.16b
   7272 	aesmc	v6.16b, v6.16b			//AES block 6 - round 12
   7273 	aese	v0.16b, v26.16b
   7274 	aesmc	v0.16b, v0.16b			//AES block 0 - round 12
   7275 	aese	v7.16b, v26.16b
   7276 	aesmc	v7.16b, v7.16b			//AES block 7 - round 12
   7277 
   7278 	aese	v5.16b, v27.16b						//AES block 5 - round 13
   7279 	aese	v1.16b, v27.16b						//AES block 1 - round 13
   7280 	aese	v2.16b, v27.16b						//AES block 2 - round 13
   7281 
   7282 	aese	v0.16b, v27.16b						//AES block 0 - round 13
   7283 	aese	v4.16b, v27.16b						//AES block 4 - round 13
   7284 	aese	v6.16b, v27.16b						//AES block 6 - round 13
   7285 
   7286 	aese	v3.16b, v27.16b						//AES block 3 - round 13
   7287 	aese	v7.16b, v27.16b						//AES block 7 - round 13
   7288 	b.ge	.L256_dec_tail						//handle tail
   7289 
   7290 	ldp	q8, q9, [x0], #32			//AES block 0, 1 - load ciphertext
   7291 
   7292 	ldp	q10, q11, [x0], #32			//AES block 2, 3 - load ciphertext
   7293 
   7294 	ldp	q12, q13, [x0], #32			//AES block 4, 5 - load ciphertext
   7295 
   7296 	ldp	q14, q15, [x0], #32			//AES block 6, 7 - load ciphertext
   7297 	cmp	x0, x5				//check if we have <= 8 blocks
   7298 
   7299 .inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 1 - result
   7300 .inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 0 - result
   7301 	stp	q0, q1, [x2], #32			//AES block 0, 1 - store result
   7302 
   7303 	rev32	v0.16b, v30.16b				//CTR block 8
   7304 	add	v30.4s, v30.4s, v31.4s		//CTR block 8
   7305 .inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 3 - result
   7306 
   7307 .inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 5 - result
   7308 
   7309 .inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 4 - result
   7310 	rev32	v1.16b, v30.16b				//CTR block 9
   7311 	add	v30.4s, v30.4s, v31.4s		//CTR block 9
   7312 
   7313 .inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 2 - result
   7314 	stp	q2, q3, [x2], #32			//AES block 2, 3 - store result
   7315 
   7316 	rev32	v2.16b, v30.16b				//CTR block 10
   7317 	add	v30.4s, v30.4s, v31.4s		//CTR block 10
   7318 
   7319 .inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 6 - result
   7320 
   7321 	rev32	v3.16b, v30.16b				//CTR block 11
   7322 	add	v30.4s, v30.4s, v31.4s		//CTR block 11
   7323 	stp	q4, q5, [x2], #32			//AES block 4, 5 - store result
   7324 
   7325 .inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 7 - result
   7326 	stp	q6, q7, [x2], #32			//AES block 6, 7 - store result
   7327 
   7328 	rev32	v4.16b, v30.16b				//CTR block 12
   7329 	add	v30.4s, v30.4s, v31.4s		//CTR block 12
   7330 	b.ge	.L256_dec_prepretail					//do prepretail
   7331 
   7332 .L256_dec_main_loop:	//main	loop start
   7333 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   7334 	ldp	q26, q27, [x8, #0]					//load rk0, rk1
   7335 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   7336 
   7337 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   7338 	ldr	q23, [x3, #176]				//load h7l | h7h
   7339 	ext	v23.16b, v23.16b, v23.16b, #8
   7340 	ldr	q25, [x3, #208]				//load h8l | h8h
   7341 	ext	v25.16b, v25.16b, v25.16b, #8
   7342 
   7343 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   7344 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   7345 	rev64	v8.16b, v8.16b						//GHASH block 8k
   7346 
   7347 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0
   7348 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   7349 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   7350 
   7351 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   7352 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   7353 
   7354 	aese	v3.16b, v26.16b
   7355 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   7356 	aese	v6.16b, v26.16b
   7357 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   7358 	aese	v2.16b, v26.16b
   7359 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   7360 
   7361 	aese	v7.16b, v26.16b
   7362 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   7363 	aese	v0.16b, v26.16b
   7364 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   7365 	aese	v5.16b, v26.16b
   7366 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   7367 
   7368 	aese	v4.16b, v26.16b
   7369 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   7370 	aese	v1.16b, v26.16b
   7371 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   7372 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   7373 
   7374 	eor	v8.16b, v8.16b, v19.16b					//PRE 1
   7375 	ldr	q20, [x3, #128]				//load h5l | h5h
   7376 	ext	v20.16b, v20.16b, v20.16b, #8
   7377 	ldr	q22, [x3, #160]				//load h6l | h6h
   7378 	ext	v22.16b, v22.16b, v22.16b, #8
   7379 	aese	v6.16b, v27.16b
   7380 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   7381 
   7382 	aese	v4.16b, v27.16b
   7383 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   7384 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   7385 	aese	v3.16b, v27.16b
   7386 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   7387 
   7388 	aese	v0.16b, v27.16b
   7389 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   7390 	aese	v5.16b, v27.16b
   7391 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   7392 	aese	v2.16b, v27.16b
   7393 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   7394 
   7395 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   7396 	aese	v7.16b, v27.16b
   7397 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   7398 	aese	v1.16b, v27.16b
   7399 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   7400 
   7401 	aese	v4.16b, v28.16b
   7402 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   7403 	aese	v0.16b, v28.16b
   7404 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   7405 	aese	v3.16b, v28.16b
   7406 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   7407 
   7408 	aese	v6.16b, v28.16b
   7409 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   7410 	aese	v7.16b, v28.16b
   7411 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   7412 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   7413 
   7414 	aese	v5.16b, v28.16b
   7415 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   7416 	aese	v2.16b, v28.16b
   7417 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   7418 	aese	v1.16b, v28.16b
   7419 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   7420 
   7421 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   7422 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   7423 	aese	v3.16b, v26.16b
   7424 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   7425 
   7426 	aese	v0.16b, v26.16b
   7427 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   7428 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   7429 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   7430 
   7431 	aese	v5.16b, v26.16b
   7432 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   7433 	aese	v6.16b, v26.16b
   7434 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   7435 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   7436 
   7437 	aese	v4.16b, v26.16b
   7438 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   7439 	aese	v1.16b, v26.16b
   7440 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   7441 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   7442 
   7443 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   7444 	aese	v2.16b, v26.16b
   7445 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   7446 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   7447 
   7448 	aese	v5.16b, v27.16b
   7449 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   7450 	aese	v7.16b, v26.16b
   7451 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   7452 	aese	v3.16b, v27.16b
   7453 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   7454 
   7455 	aese	v2.16b, v27.16b
   7456 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   7457 	aese	v0.16b, v27.16b
   7458 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   7459 	aese	v1.16b, v27.16b
   7460 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   7461 
   7462 	aese	v6.16b, v27.16b
   7463 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   7464 	aese	v7.16b, v27.16b
   7465 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   7466 	aese	v4.16b, v27.16b
   7467 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   7468 
   7469 	ldr	q21, [x3, #144]				//load h6k | h5k
   7470 	ldr	q24, [x3, #192]				//load h8k | h7k
   7471 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   7472 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   7473 
   7474 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   7475 	aese	v5.16b, v28.16b
   7476 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   7477 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   7478 
   7479 	aese	v0.16b, v28.16b
   7480 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   7481 	aese	v3.16b, v28.16b
   7482 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   7483 	aese	v7.16b, v28.16b
   7484 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   7485 
   7486 	aese	v1.16b, v28.16b
   7487 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   7488 	aese	v2.16b, v28.16b
   7489 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   7490 	aese	v6.16b, v28.16b
   7491 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   7492 
   7493 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   7494 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   7495 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   7496 
   7497 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   7498 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   7499 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   7500 
   7501 	aese	v3.16b, v26.16b
   7502 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   7503 	aese	v0.16b, v26.16b
   7504 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   7505 	aese	v4.16b, v28.16b
   7506 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   7507 
   7508 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   7509 	aese	v1.16b, v26.16b
   7510 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   7511 	aese	v6.16b, v26.16b
   7512 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   7513 
   7514 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   7515 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   7516 	aese	v4.16b, v26.16b
   7517 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   7518 
   7519 	aese	v2.16b, v26.16b
   7520 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   7521 	aese	v5.16b, v26.16b
   7522 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   7523 	aese	v7.16b, v26.16b
   7524 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   7525 
   7526 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   7527 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   7528 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   7529 
   7530 	ldr	q23, [x3, #80]				//load h3l | h3h
   7531 	ext	v23.16b, v23.16b, v23.16b, #8
   7532 	ldr	q25, [x3, #112]				//load h4l | h4h
   7533 	ext	v25.16b, v25.16b, v25.16b, #8
   7534 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   7535 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   7536 
   7537 	aese	v2.16b, v27.16b
   7538 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   7539 	aese	v5.16b, v27.16b
   7540 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   7541 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   7542 
   7543 	ldr	q20, [x3, #32]				//load h1l | h1h
   7544 	ext	v20.16b, v20.16b, v20.16b, #8
   7545 	ldr	q22, [x3, #64]				//load h2l | h2h
   7546 	ext	v22.16b, v22.16b, v22.16b, #8
   7547 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   7548 	aese	v7.16b, v27.16b
   7549 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   7550 
   7551 	aese	v1.16b, v27.16b
   7552 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   7553 	aese	v3.16b, v27.16b
   7554 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   7555 	aese	v6.16b, v27.16b
   7556 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   7557 
   7558 	ldr	q21, [x3, #48]				//load h2k | h1k
   7559 	ldr	q24, [x3, #96]				//load h4k | h3k
   7560 	aese	v0.16b, v27.16b
   7561 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   7562 	aese	v4.16b, v27.16b
   7563 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   7564 
   7565 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   7566 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   7567 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   7568 
   7569 	aese	v5.16b, v28.16b
   7570 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   7571 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   7572 	aese	v2.16b, v28.16b
   7573 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   7574 
   7575 	aese	v6.16b, v28.16b
   7576 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   7577 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   7578 	aese	v1.16b, v28.16b
   7579 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   7580 
   7581 	aese	v4.16b, v28.16b
   7582 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   7583 	aese	v0.16b, v28.16b
   7584 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   7585 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   7586 
   7587 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   7588 	aese	v3.16b, v28.16b
   7589 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   7590 	aese	v7.16b, v28.16b
   7591 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   7592 
   7593 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   7594 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   7595 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   7596 
   7597 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   7598 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   7599 	aese	v3.16b, v26.16b
   7600 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   7601 
   7602 	aese	v6.16b, v26.16b
   7603 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   7604 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   7605 	aese	v5.16b, v26.16b
   7606 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   7607 
   7608 	ldp	q8, q9, [x0], #32			//AES block 8k+8, 8k+9 - load ciphertext
   7609 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   7610 	aese	v7.16b, v26.16b
   7611 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   7612 
   7613 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   7614 	aese	v2.16b, v26.16b
   7615 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   7616 	aese	v1.16b, v26.16b
   7617 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   7618 
   7619 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   7620 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   7621 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   7622 
   7623 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   7624 	aese	v3.16b, v27.16b
   7625 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   7626 	aese	v6.16b, v27.16b
   7627 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   7628 
   7629 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   7630 	aese	v0.16b, v26.16b
   7631 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   7632 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   7633 
   7634 	aese	v4.16b, v26.16b
   7635 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   7636 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   7637 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   7638 
   7639 	aese	v2.16b, v27.16b
   7640 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   7641 	aese	v5.16b, v27.16b
   7642 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   7643 	aese	v7.16b, v27.16b
   7644 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   7645 
   7646 	aese	v1.16b, v27.16b
   7647 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   7648 	aese	v0.16b, v27.16b
   7649 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   7650 	aese	v4.16b, v27.16b
   7651 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   7652 
   7653 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   7654 	rev32	v20.16b, v30.16b					//CTR block 8k+16
   7655 	ldr	d16, [x10]			//MODULO - load modulo constant
   7656 
   7657 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+16
   7658 	aese	v1.16b, v28.16b
   7659 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
   7660 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   7661 
   7662 	aese	v0.16b, v28.16b
   7663 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
   7664 	aese	v6.16b, v28.16b
   7665 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
   7666 
   7667 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   7668 	rev32	v22.16b, v30.16b					//CTR block 8k+17
   7669 	aese	v2.16b, v28.16b
   7670 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
   7671 
   7672 	ldp	q10, q11, [x0], #32			//AES block 8k+10, 8k+11 - load ciphertext
   7673 	aese	v7.16b, v28.16b
   7674 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
   7675 	ext	v21.16b, v17.16b, v17.16b, #8				 //MODULO - other top alignment
   7676 
   7677 	aese	v5.16b, v28.16b
   7678 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
   7679 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+17
   7680 	aese	v3.16b, v28.16b
   7681 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
   7682 
   7683 	aese	v2.16b, v26.16b
   7684 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
   7685 	aese	v7.16b, v26.16b
   7686 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
   7687 	aese	v6.16b, v26.16b
   7688 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
   7689 
   7690 	rev32	v23.16b, v30.16b					//CTR block 8k+18
   7691 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+18
   7692 	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
   7693 
   7694 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   7695 	aese	v1.16b, v26.16b
   7696 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 12
   7697 	aese	v4.16b, v28.16b
   7698 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
   7699 
   7700 	ldr	q28, [x8, #224]					//load rk14
   7701 	aese	v5.16b, v26.16b
   7702 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
   7703 	aese	v3.16b, v26.16b
   7704 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
   7705 
   7706 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   7707 	aese	v0.16b, v26.16b
   7708 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
   7709 	aese	v4.16b, v26.16b
   7710 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
   7711 
   7712 	ldp	q12, q13, [x0], #32			//AES block 8k+12, 8k+13 - load ciphertext
   7713 	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
   7714 	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
   7715 
   7716 	ldp	q14, q15, [x0], #32			//AES block 8k+14, 8k+15 - load ciphertext
   7717 	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
   7718 	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
   7719 
   7720 	rev32	v25.16b, v30.16b					//CTR block 8k+19
   7721 .inst	0xce027142	//eor3 v2.16b, v10.16b, v2.16b, v28.16b				//AES block 8k+10 - result
   7722 .inst	0xce017121	//eor3 v1.16b, v9.16b, v1.16b, v28.16b				//AES block 8k+9 - result
   7723 
   7724 	ext	v21.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   7725 	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
   7726 
   7727 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+19
   7728 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   7729 	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
   7730 
   7731 .inst	0xce0571a5	//eor3 v5.16b, v13.16b, v5.16b, v28.16b				//AES block 8k+13 - result
   7732 .inst	0xce007100	//eor3 v0.16b, v8.16b, v0.16b, v28.16b				//AES block 8k+8 - result
   7733 	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
   7734 
   7735 	stp	q0, q1, [x2], #32			//AES block 8k+8, 8k+9 - store result
   7736 	mov	v0.16b, v20.16b					//CTR block 8k+16
   7737 .inst	0xce047184	//eor3 v4.16b, v12.16b, v4.16b, v28.16b				//AES block 8k+12 - result
   7738 
   7739 .inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
   7740 .inst	0xce037163	//eor3 v3.16b, v11.16b, v3.16b, v28.16b				//AES block 8k+11 - result
   7741 	stp	q2, q3, [x2], #32			//AES block 8k+10, 8k+11 - store result
   7742 
   7743 	mov	v3.16b, v25.16b					//CTR block 8k+19
   7744 	mov	v2.16b, v23.16b					//CTR block 8k+18
   7745 	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
   7746 
   7747 	mov	v1.16b, v22.16b					//CTR block 8k+17
   7748 	stp	q4, q5, [x2], #32			//AES block 8k+12, 8k+13 - store result
   7749 .inst	0xce0771e7	//eor3 v7.16b, v15.16b, v7.16b, v28.16b				//AES block 8k+15 - result
   7750 
   7751 .inst	0xce0671c6	//eor3 v6.16b, v14.16b, v6.16b, v28.16b				//AES block 8k+14 - result
   7752 	rev32	v4.16b, v30.16b				//CTR block 8k+20
   7753 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+20
   7754 
   7755 	cmp	x0, x5				//.LOOP CONTROL
   7756 	stp	q6, q7, [x2], #32			//AES block 8k+14, 8k+15 - store result
   7757 	b.lt	.L256_dec_main_loop
   7758 
   7759 .L256_dec_prepretail:	//PREPRETAIL
        // Entered by fall-through once the main loop's "cmp x0, x5 / b.lt" is not taken.
        // NOTE(review): may also be branched to from code above this excerpt - confirm.
        //
        // Work done here:
        //  * AES rounds 0-13 on the eight counter blocks v0-v7, with the round keys
        //    streamed from [x8] through v26/v27/v28. Round 13 is a bare aese (no aesmc);
        //    the XOR with rk14 (left in v28) is not done in this section.
        //  * GHASH of v8-v15 (loaded as ciphertext by the main loop) against the hash
        //    powers read from [x3], accumulated in v17 (high), v18 (mid), v19 (low) and
        //    then reduced into v19 with the constant loaded from [x10].
        //  * v5, v6, v7 receive the next three counter values from v30; v30 is bumped
        //    once more near the end.
        // No access to x0 (input) or x2 (output) happens in this section.
   7760 	ldp	q26, q27, [x8, #0]					//load rk0, rk1
   7761 	rev32	v5.16b, v30.16b				//CTR block 8k+13
   7762 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+13
   7763 
   7764 	rev64	v12.16b, v12.16b						//GHASH block 8k+4
   7765 	ldr	q21, [x3, #144]				//load h6k | h5k
   7766 	ldr	q24, [x3, #192]				//load h8k | h7k
   7767 
   7768 	rev32	v6.16b, v30.16b				//CTR block 8k+14
   7769 	rev64	v8.16b, v8.16b						//GHASH block 8k
   7770 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+14
   7771 
   7772 	ext	v19.16b, v19.16b, v19.16b, #8				//PRE 0 - swap the 64-bit halves of the running GHASH value
   7773 	ldr	q23, [x3, #176]				//load h7l | h7h
   7774 	ext	v23.16b, v23.16b, v23.16b, #8				//swap the 64-bit halves of h7
   7775 	ldr	q25, [x3, #208]				//load h8l | h8h
   7776 	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of h8
   7777 	rev64	v9.16b, v9.16b						//GHASH block 8k+1
   7778 
   7779 	rev32	v7.16b, v30.16b				//CTR block 8k+15
   7780 	rev64	v10.16b, v10.16b						//GHASH block 8k+2
   7781 	ldr	q20, [x3, #128]				//load h5l | h5h
   7782 	ext	v20.16b, v20.16b, v20.16b, #8				//swap the 64-bit halves of h5
   7783 	ldr	q22, [x3, #160]				//load h6l | h6h
   7784 	ext	v22.16b, v22.16b, v22.16b, #8				//swap the 64-bit halves of h6
   7785 
   7786 	aese	v0.16b, v26.16b
   7787 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 0
   7788 	aese	v1.16b, v26.16b
   7789 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 0
   7790 	aese	v4.16b, v26.16b
   7791 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 0
   7792 
   7793 	aese	v3.16b, v26.16b
   7794 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 0
   7795 	aese	v5.16b, v26.16b
   7796 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 0
   7797 	aese	v6.16b, v26.16b
   7798 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 0
   7799 
   7800 	aese	v4.16b, v27.16b
   7801 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 1
   7802 	aese	v7.16b, v26.16b
   7803 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 0
   7804 	aese	v2.16b, v26.16b
   7805 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 0
   7806 
   7807 	ldp	q28, q26, [x8, #32]				//load rk2, rk3
   7808 	aese	v0.16b, v27.16b
   7809 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 1
   7810 	eor	v8.16b, v8.16b, v19.16b					//PRE 1 - fold the running GHASH value into block 8k
   7811 
   7812 	aese	v7.16b, v27.16b
   7813 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 1
   7814 	aese	v6.16b, v27.16b
   7815 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 1
   7816 	aese	v2.16b, v27.16b
   7817 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 1
   7818 
   7819 	aese	v3.16b, v27.16b
   7820 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 1
   7821 	aese	v1.16b, v27.16b
   7822 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 1
   7823 	aese	v5.16b, v27.16b
   7824 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 1
   7825 
   7826 	pmull2	v16.1q, v9.2d, v23.2d				//GHASH block 8k+1 - high
   7827 	trn1	v18.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   7828 	pmull	v19.1q, v8.1d, v25.1d				//GHASH block 8k - low
   7829 
   7830 	rev64	v11.16b, v11.16b						//GHASH block 8k+3
   7831 	pmull	v23.1q, v9.1d, v23.1d				//GHASH block 8k+1 - low
   7832 
   7833 	aese	v5.16b, v28.16b
   7834 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 2
   7835 	aese	v7.16b, v28.16b
   7836 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 2
   7837 	aese	v1.16b, v28.16b
   7838 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 2
   7839 
   7840 	aese	v3.16b, v28.16b
   7841 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 2
   7842 	aese	v6.16b, v28.16b
   7843 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 2
   7844 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH block 8k - high
   7845 
   7846 	aese	v0.16b, v28.16b
   7847 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 2
   7848 	aese	v7.16b, v26.16b
   7849 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 3
   7850 
   7851 	aese	v5.16b, v26.16b
   7852 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 3
   7853 	rev64	v14.16b, v14.16b						//GHASH block 8k+6
   7854 
   7855 	aese	v0.16b, v26.16b
   7856 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 3
   7857 	aese	v2.16b, v28.16b
   7858 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 2
   7859 	aese	v6.16b, v26.16b
   7860 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 3
   7861 
   7862 	pmull2	v29.1q, v10.2d, v22.2d				//GHASH block 8k+2 - high
   7863 	trn2	v8.2d, v9.2d, v8.2d				//GHASH block 8k, 8k+1 - mid
   7864 	aese	v4.16b, v28.16b
   7865 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 2
   7866 
   7867 	ldp	q27, q28, [x8, #64]				//load rk4, rk5
   7868 	aese	v1.16b, v26.16b
   7869 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 3
   7870 	pmull2	v9.1q, v11.2d, v20.2d				//GHASH block 8k+3 - high
   7871 
   7872 	aese	v2.16b, v26.16b
   7873 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 3
   7874 	eor	v17.16b, v17.16b, v16.16b				//GHASH block 8k+1 - high
   7875 	eor	v8.16b, v8.16b, v18.16b			//GHASH block 8k, 8k+1 - mid
   7876 
   7877 	aese	v4.16b, v26.16b
   7878 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 3
   7879 	pmull	v22.1q, v10.1d, v22.1d				//GHASH block 8k+2 - low
   7880 	aese	v3.16b, v26.16b
   7881 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 3
   7882 
   7883 .inst	0xce1d2631	//eor3 v17.16b, v17.16b, v29.16b, v9.16b			//GHASH block 8k+2, 8k+3 - high
   7884 	trn1	v29.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   7885 	trn2	v10.2d, v11.2d, v10.2d				//GHASH block 8k+2, 8k+3 - mid
   7886 
   7887 	pmull2	v18.1q, v8.2d, v24.2d				//GHASH block 8k	- mid
   7888 	pmull	v20.1q, v11.1d, v20.1d				//GHASH block 8k+3 - low
   7889 	eor	v19.16b, v19.16b, v23.16b				//GHASH block 8k+1 - low
   7890 
   7891 	pmull	v24.1q, v8.1d, v24.1d				//GHASH block 8k+1 - mid
   7892 	aese	v5.16b, v27.16b
   7893 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 4
   7894 	aese	v0.16b, v27.16b
   7895 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 4
   7896 
   7897 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+2, 8k+3 - low
   7898 	ldr	q20, [x3, #32]				//load h1l | h1h
   7899 	ext	v20.16b, v20.16b, v20.16b, #8				//swap the 64-bit halves of h1
   7900 	ldr	q22, [x3, #64]				//load h2l | h2h
   7901 	ext	v22.16b, v22.16b, v22.16b, #8				//swap the 64-bit halves of h2
   7902 	aese	v7.16b, v27.16b
   7903 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 4
   7904 
   7905 	aese	v2.16b, v27.16b
   7906 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 4
   7907 	aese	v6.16b, v27.16b
   7908 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 4
   7909 	eor	v18.16b, v18.16b, v24.16b				//GHASH block 8k+1 - mid
   7910 
   7911 	eor	v10.16b, v10.16b, v29.16b				//GHASH block 8k+2, 8k+3 - mid
   7912 	aese	v7.16b, v28.16b
   7913 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 5
   7914 	aese	v1.16b, v27.16b
   7915 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 4
   7916 
   7917 	aese	v2.16b, v28.16b
   7918 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 5
   7919 	aese	v3.16b, v27.16b
   7920 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 4
   7921 	aese	v4.16b, v27.16b
   7922 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 4
   7923 
   7924 	aese	v1.16b, v28.16b
   7925 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 5
   7926 	pmull2	v29.1q, v10.2d, v21.2d				//GHASH block 8k+2 - mid
   7927 	aese	v6.16b, v28.16b
   7928 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 5
   7929 
   7930 	aese	v4.16b, v28.16b
   7931 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 5
   7932 	aese	v3.16b, v28.16b
   7933 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 5
   7934 	pmull	v21.1q, v10.1d, v21.1d				//GHASH block 8k+3 - mid
   7935 
   7936 	aese	v0.16b, v28.16b
   7937 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 5
   7938 	aese	v5.16b, v28.16b
   7939 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 5
   7940 	ldp	q26, q27, [x8, #96]				//load rk6, rk7
   7941 
   7942 	ldr	q23, [x3, #80]				//load h3l | h3h
   7943 	ext	v23.16b, v23.16b, v23.16b, #8				//swap the 64-bit halves of h3
   7944 	ldr	q25, [x3, #112]				//load h4l | h4h
   7945 	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of h4
   7946 	rev64	v15.16b, v15.16b						//GHASH block 8k+7
   7947 	rev64	v13.16b, v13.16b						//GHASH block 8k+5
   7948 
   7949 .inst	0xce157652	//eor3 v18.16b, v18.16b, v21.16b, v29.16b			//GHASH block 8k+2, 8k+3 - mid
   7950 
   7951 	trn1	v16.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   7952 
   7953 	aese	v0.16b, v26.16b
   7954 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 6
   7955 	ldr	q21, [x3, #48]				//load h2k | h1k
   7956 	ldr	q24, [x3, #96]				//load h4k | h3k
   7957 	aese	v6.16b, v26.16b
   7958 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 6
   7959 
   7960 	aese	v5.16b, v26.16b
   7961 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 6
   7962 	aese	v7.16b, v26.16b
   7963 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 6
   7964 
   7965 	pmull2	v8.1q, v12.2d, v25.2d				//GHASH block 8k+4 - high
   7966 	pmull2	v10.1q, v13.2d, v23.2d				//GHASH block 8k+5 - high
   7967 	pmull	v25.1q, v12.1d, v25.1d				//GHASH block 8k+4 - low
   7968 
   7969 	trn2	v12.2d, v13.2d, v12.2d				//GHASH block 8k+4, 8k+5 - mid
   7970 	pmull	v23.1q, v13.1d, v23.1d				//GHASH block 8k+5 - low
   7971 	trn1	v13.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   7972 
   7973 	aese	v7.16b, v27.16b
   7974 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 7
   7975 	pmull2	v11.1q, v14.2d, v22.2d				//GHASH block 8k+6 - high
   7976 	aese	v1.16b, v26.16b
   7977 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 6
   7978 
   7979 	aese	v2.16b, v26.16b
   7980 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 6
   7981 	aese	v3.16b, v26.16b
   7982 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 6
   7983 	aese	v4.16b, v26.16b
   7984 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 6
   7985 
   7986 	ldp	q28, q26, [x8, #128]				//load rk8, rk9
   7987 	pmull	v22.1q, v14.1d, v22.1d				//GHASH block 8k+6 - low
   7988 	aese	v5.16b, v27.16b
   7989 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 7
   7990 
   7991 	aese	v1.16b, v27.16b
   7992 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 7
   7993 	aese	v4.16b, v27.16b
   7994 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 7
   7995 
   7996 	aese	v6.16b, v27.16b
   7997 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 7
   7998 	aese	v2.16b, v27.16b
   7999 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 7
   8000 .inst	0xce082a31	//eor3 v17.16b, v17.16b, v8.16b, v10.16b			//GHASH block 8k+4, 8k+5 - high
   8001 
   8002 	aese	v0.16b, v27.16b
   8003 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 7
   8004 	trn2	v14.2d, v15.2d, v14.2d				//GHASH block 8k+6, 8k+7 - mid
   8005 	aese	v3.16b, v27.16b
   8006 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 7
   8007 
   8008 	aese	v0.16b, v28.16b
   8009 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 8
   8010 	aese	v7.16b, v28.16b
   8011 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 8
   8012 	aese	v4.16b, v28.16b
   8013 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 8
   8014 
   8015 	aese	v1.16b, v28.16b
   8016 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 8
   8017 	aese	v5.16b, v28.16b
   8018 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 8
   8019 	aese	v6.16b, v28.16b
   8020 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 8
   8021 
   8022 	aese	v3.16b, v28.16b
   8023 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 8
   8024 	aese	v4.16b, v26.16b
   8025 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 9
   8026 	eor	v12.16b, v12.16b, v16.16b				//GHASH block 8k+4, 8k+5 - mid
   8027 
   8028 	aese	v0.16b, v26.16b
   8029 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 9
   8030 	aese	v1.16b, v26.16b
   8031 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 9
   8032 	eor	v14.16b, v14.16b, v13.16b				//GHASH block 8k+6, 8k+7 - mid
   8033 
   8034 	aese	v6.16b, v26.16b
   8035 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 9
   8036 	aese	v7.16b, v26.16b
   8037 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 9
   8038 	pmull2	v16.1q, v12.2d, v24.2d				//GHASH block 8k+4 - mid
   8039 
   8040 	aese	v2.16b, v28.16b
   8041 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 8
   8042 	pmull	v24.1q, v12.1d, v24.1d				//GHASH block 8k+5 - mid
   8043 	pmull2	v12.1q, v15.2d, v20.2d				//GHASH block 8k+7 - high
   8044 
   8045 	pmull2	v13.1q, v14.2d, v21.2d				//GHASH block 8k+6 - mid
   8046 	pmull	v21.1q, v14.1d, v21.1d				//GHASH block 8k+7 - mid
   8047 	pmull	v20.1q, v15.1d, v20.1d				//GHASH block 8k+7 - low
   8048 
   8049 	ldp	q27, q28, [x8, #160]				//load rk10, rk11
   8050 .inst	0xce195e73	//eor3 v19.16b, v19.16b, v25.16b, v23.16b			//GHASH block 8k+4, 8k+5 - low
   8051 .inst	0xce184252	//eor3 v18.16b, v18.16b, v24.16b, v16.16b			//GHASH block 8k+4, 8k+5 - mid
   8052 
   8053 	aese	v2.16b, v26.16b
   8054 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 9
   8055 	aese	v3.16b, v26.16b
   8056 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 9
   8057 	aese	v5.16b, v26.16b
   8058 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 9
   8059 
   8060 .inst	0xce0b3231	//eor3 v17.16b, v17.16b, v11.16b, v12.16b			//GHASH block 8k+6, 8k+7 - high
   8061 .inst	0xce165273	//eor3 v19.16b, v19.16b, v22.16b, v20.16b			//GHASH block 8k+6, 8k+7 - low
   8062 	ldr	d16, [x10]			//MODULO - load modulo constant
   8063 
   8064 .inst	0xce153652	//eor3 v18.16b, v18.16b, v21.16b, v13.16b			//GHASH block 8k+6, 8k+7 - mid
   8065 
   8066 	aese	v4.16b, v27.16b
   8067 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 10
   8068 	aese	v6.16b, v27.16b
   8069 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 10
   8070 	aese	v5.16b, v27.16b
   8071 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 10
   8072 
   8073 	aese	v0.16b, v27.16b
   8074 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 10
   8075 	aese	v2.16b, v27.16b
   8076 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 10
   8077 	aese	v3.16b, v27.16b
   8078 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 10
   8079 
   8080 .inst	0xce114e52	//eor3 v18.16b, v18.16b, v17.16b, v19.16b		 	//MODULO - karatsuba tidy up
   8081 
   8082 	aese	v7.16b, v27.16b
   8083 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 10
   8084 	aese	v1.16b, v27.16b
   8085 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 10
   8086 	ldp	q26, q27, [x8, #192]				//load rk12, rk13
   8087 
   8088 	ext	v21.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   8089 
   8090 	aese	v2.16b, v28.16b
   8091 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 11
   8092 	aese	v1.16b, v28.16b
   8093 	aesmc	v1.16b, v1.16b			//AES block 8k+9 - round 11
   8094 	aese	v0.16b, v28.16b
   8095 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 11
   8096 
   8097 	pmull	v29.1q, v17.1d, v16.1d			//MODULO - top 64b align with mid
   8098 	aese	v3.16b, v28.16b
   8099 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 11
   8100 
   8101 	aese	v7.16b, v28.16b
   8102 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 11
   8103 	aese	v6.16b, v28.16b
   8104 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 11
   8105 	aese	v4.16b, v28.16b
   8106 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 11
   8107 
   8108 	aese	v5.16b, v28.16b
   8109 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 11
   8110 	aese	v3.16b, v26.16b
   8111 	aesmc	v3.16b, v3.16b			//AES block 8k+11 - round 12
   8112 
   8113 .inst	0xce1d5652	//eor3 v18.16b, v18.16b, v29.16b, v21.16b			//MODULO - fold into mid
   8114 
   8115 	aese	v3.16b, v27.16b						//AES block 8k+11 - round 13
   8116 	aese	v2.16b, v26.16b
   8117 	aesmc	v2.16b, v2.16b			//AES block 8k+10 - round 12
   8118 	aese	v6.16b, v26.16b
   8119 	aesmc	v6.16b, v6.16b			//AES block 8k+14 - round 12
   8120 
   8121 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   8122 	aese	v4.16b, v26.16b
   8123 	aesmc	v4.16b, v4.16b			//AES block 8k+12 - round 12
   8124 	aese	v7.16b, v26.16b
   8125 	aesmc	v7.16b, v7.16b			//AES block 8k+15 - round 12
   8126 
   8127 	aese	v0.16b, v26.16b
   8128 	aesmc	v0.16b, v0.16b			//AES block 8k+8 - round 12
   8129 	ldr	q28, [x8, #224]					//load rk14
   8130 	aese	v1.16b, v26.16b
   8131 	aesmc	v1.16b, v1.16b	        	//AES block 8k+9 - round 12
   8132 
   8133 	aese	v4.16b, v27.16b						//AES block 8k+12 - round 13
   8134 	ext	v21.16b, v18.16b, v18.16b, #8			 	//MODULO - other mid alignment
   8135 	aese	v5.16b, v26.16b
   8136 	aesmc	v5.16b, v5.16b			//AES block 8k+13 - round 12
   8137 
   8138 	aese	v6.16b, v27.16b						//AES block 8k+14 - round 13
   8139 	aese	v2.16b, v27.16b						//AES block 8k+10 - round 13
   8140 	aese	v1.16b, v27.16b						//AES block 8k+9 - round 13
   8141 
   8142 	aese	v5.16b, v27.16b						//AES block 8k+13 - round 13
   8143 .inst	0xce154673	//eor3 v19.16b, v19.16b, v21.16b, v17.16b		 	//MODULO - fold into low
   8144 	add	v30.4s, v30.4s, v31.4s		//CTR block 8k+15
   8145 
   8146 	aese	v7.16b, v27.16b						//AES block 8k+15 - round 13
   8147 	aese	v0.16b, v27.16b						//AES block 8k+8 - round 13
   8148 .L256_dec_tail:	//TAIL
        // Handles the final blocks. x5 = x4 - x0 is compared against
        // 112, 96, 80, 64, 48, 32 and 16 to choose the entry label.
        // NOTE(review): x4 is presumably the end-of-input pointer set up above this
        // excerpt - confirm.
        //
        // Before dispatching, the first ciphertext block is loaded into v9 and its
        // result v9 ^ v0 ^ v29 is formed in v12 (v29 = copy of v28).
        // Each rung of the ladder that is NOT taken:
        //  * moves the not-yet-used keystream blocks up by one register, so that the
        //    register numbers hard-coded at the chosen entry label line up, and
        //  * subtracts one increment (v31) from the counter v30.
        // The first rung also clears the GHASH accumulators v17/v18/v19; the previous
        // GHASH value is carried separately in v16.
   8149 
   8150 	ext	v16.16b, v19.16b, v19.16b, #8				//prepare final partial tag
   8151 	sub	x5, x4, x0		//main_end_input_ptr is number of bytes left to process
   8152 	cmp	x5, #112		//flags are consumed by the b.gt to .L256_dec_blocks_more_than_7 below
   8153 
   8154 	ldr	q9, [x0], #16				//AES block 8k+8 - load ciphertext
   8155 
   8156 	ldp	q24, q25, [x3, #192]			//load h8k | h7k (q24) and h8l | h8h (q25)
   8157 	ext	v25.16b, v25.16b, v25.16b, #8			//swap the 64-bit halves of h8
   8158 	mov	v29.16b, v28.16b			//keep a copy of v28 in v29; v28 is overwritten as GHASH scratch further down
   8159 
   8160 	ldp	q20, q21, [x3, #128]			//load h5l | h5h (q20) and h6k | h5k (q21)
   8161 	ext	v20.16b, v20.16b, v20.16b, #8			//swap the 64-bit halves of h5
   8162 
   8163 .inst	0xce00752c	//eor3 v12.16b, v9.16b, v0.16b, v29.16b				//AES block 8k+8 - result
   8164 	ldp	q22, q23, [x3, #160]			//load h6l | h6h (q22) and h7l | h7h (q23)
   8165 	ext	v22.16b, v22.16b, v22.16b, #8			//swap the 64-bit halves of h6
   8166 	ext	v23.16b, v23.16b, v23.16b, #8			//swap the 64-bit halves of h7
   8167 	b.gt	.L256_dec_blocks_more_than_7
   8168 
   8169 	mov	v7.16b, v6.16b				//x5 <= 112: shift keystream v1..v6 up into v2..v7
   8170 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8171 	mov	v6.16b, v5.16b
   8172 
   8173 	mov	v5.16b, v4.16b
   8174 	mov	v4.16b, v3.16b
   8175 	movi	v19.8b, #0				//clear GHASH low accumulator
   8176 
   8177 	movi	v17.8b, #0				//clear GHASH high accumulator
   8178 	movi	v18.8b, #0				//clear GHASH mid accumulator
   8179 	mov	v3.16b, v2.16b
   8180 
   8181 	cmp	x5, #96
   8182 	mov	v2.16b, v1.16b
   8183 	b.gt	.L256_dec_blocks_more_than_6
   8184 
   8185 	mov	v7.16b, v6.16b				//x5 <= 96: shift up again, v1 now lands in v3
   8186 	mov	v6.16b, v5.16b
   8187 
   8188 	mov	v5.16b, v4.16b
   8189 	cmp	x5, #80
   8190 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8191 
   8192 	mov	v4.16b, v3.16b
   8193 	mov	v3.16b, v1.16b
   8194 	b.gt	.L256_dec_blocks_more_than_5
   8195 
   8196 	cmp	x5, #64
   8197 	mov	v7.16b, v6.16b				//x5 <= 80: shift up again, v1 now lands in v4
   8198 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8199 
   8200 	mov	v6.16b, v5.16b
   8201 
   8202 	mov	v5.16b, v4.16b
   8203 	mov	v4.16b, v1.16b
   8204 	b.gt	.L256_dec_blocks_more_than_4
   8205 
   8206 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8207 	mov	v7.16b, v6.16b				//x5 <= 64: shift up again, v1 now lands in v5
   8208 	cmp	x5, #48
   8209 
   8210 	mov	v6.16b, v5.16b
   8211 	mov	v5.16b, v1.16b
   8212 	b.gt	.L256_dec_blocks_more_than_3
   8213 
   8214 	ldr	q24, [x3, #96]				//load h4k | h3k
   8215 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8216 	mov	v7.16b, v6.16b				//x5 <= 48: shift up again, v1 now lands in v6
   8217 
   8218 	cmp	x5, #32
   8219 	mov	v6.16b, v1.16b
   8220 	b.gt	.L256_dec_blocks_more_than_2
   8221 
   8222 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8223 
   8224 	mov	v7.16b, v1.16b				//x5 <= 32: v1 now lands in v7
   8225 	cmp	x5, #16
   8226 	b.gt	.L256_dec_blocks_more_than_1
   8227 
   8228 	sub	v30.4s, v30.4s, v31.4s			//undo one counter increment
   8229 	ldr	q21, [x3, #48]				//load h2k | h1k
   8230 	b	.L256_dec_blocks_less_than_1
   8231 .L256_dec_blocks_more_than_7:	//blocks	left >  7
        // On entry v9 holds the final-7 ciphertext block and v12 its result.
        // This step stores v12, loads the next ciphertext block, forms its result from
        // keystream v1, and starts the GHASH accumulators: v17 (high), v19 (low) and
        // v18 (mid) are written directly by the multiplies, not accumulated.
        // Multiplier: v25 for high/low, upper half of v24 for mid.
   8232 	rev64	v8.16b, v9.16b						//GHASH final-7 block
   8233 	ldr	q9, [x0], #16				//AES final-6 block - load ciphertext
   8234 	st1	{ v12.16b}, [x2], #16				//AES final-7 block  - store result
   8235 
   8236 	ins	v18.d[0], v24.d[1]					//GHASH final-7 block - mid
   8237 
   8238 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8239 
   8240 	ins	v27.d[0], v8.d[1]					//GHASH final-7 block - mid
   8241 .inst	0xce01752c	//eor3 v12.16b, v9.16b, v1.16b, v29.16b				//AES final-6 block - result
   8242 
   8243 	pmull2	v17.1q, v8.2d, v25.2d				//GHASH final-7 block - high
   8244 
   8245 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-7 block - mid
   8246 	movi	v16.8b, #0						//suppress further partial tag feed in
   8247 
   8248 	pmull	v19.1q, v8.1d, v25.1d				//GHASH final-7 block - low
   8249 	pmull	v18.1q, v27.1d, v18.1d			 	//GHASH final-7 block - mid
   8250 .L256_dec_blocks_more_than_6:	//blocks	left >  6
        // On entry v9 holds the final-6 ciphertext block and v12 its result.
        // Stores v12, loads the next ciphertext block, forms its result from keystream v2,
        // and XORs this block's GHASH products into v17 (high), v18 (mid), v19 (low).
        // Multiplier: v23 for high/low, lower half of v24 for mid.
        // v26, v27 and v28 are used as scratch for the products.
   8251 
   8252 	rev64	v8.16b, v9.16b						//GHASH final-6 block
   8253 
   8254 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8255 	ldr	q9, [x0], #16				//AES final-5 block - load ciphertext
   8256 	movi	v16.8b, #0						//suppress further partial tag feed in
   8257 
   8258 	ins	v27.d[0], v8.d[1]					//GHASH final-6 block - mid
   8259 	st1	{ v12.16b}, [x2], #16				//AES final-6 block - store result
   8260 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-6 block - high
   8261 
   8262 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-6 block - low
   8263 
   8264 .inst	0xce02752c	//eor3 v12.16b, v9.16b, v2.16b, v29.16b				//AES final-5 block - result
   8265 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-6 block - low
   8266 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-6 block - mid
   8267 
   8268 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-6 block - mid
   8269 
   8270 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-6 block - mid
   8271 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-6 block - high
   8272 .L256_dec_blocks_more_than_5:	//blocks	left >  5
        // On entry v9 holds the final-5 ciphertext block and v12 its result.
        // Stores v12, loads the next ciphertext block, forms its result from keystream v3,
        // and XORs this block's GHASH products into v17 (high), v18 (mid), v19 (low).
        // Multiplier: v22 for high/low, upper half of v21 for mid. The mid operand is
        // copied into the upper half of v27 first so that pmull2 can be used.
   8273 
   8274 	rev64	v8.16b, v9.16b						//GHASH final-5 block
   8275 
   8276 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8277 
   8278 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-5 block - high
   8279 	ins	v27.d[0], v8.d[1]					//GHASH final-5 block - mid
   8280 
   8281 	ldr	q9, [x0], #16				//AES final-4 block - load ciphertext
   8282 
   8283 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-5 block - mid
   8284 	st1	{ v12.16b}, [x2], #16			  	//AES final-5 block - store result
   8285 
   8286 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-5 block - low
   8287 	ins	v27.d[1], v27.d[0]					//GHASH final-5 block - mid
   8288 
   8289 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-5 block - mid
   8290 
   8291 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-5 block - high
   8292 .inst	0xce03752c	//eor3 v12.16b, v9.16b, v3.16b, v29.16b				//AES final-4 block - result
   8293 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-5 block - low
   8294 
   8295 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-5 block - mid
   8296 	movi	v16.8b, #0						//suppress further partial tag feed in
   8297 .L256_dec_blocks_more_than_4:	//blocks	left >  4
        // On entry v9 holds the final-4 ciphertext block and v12 its result.
        // Stores v12, loads the next ciphertext block, forms its result from keystream v4,
        // and XORs this block's GHASH products into v17 (high), v18 (mid), v19 (low).
        // Multiplier: v20 for high/low, lower half of v21 for mid.
   8298 
   8299 	rev64	v8.16b, v9.16b						//GHASH final-4 block
   8300 
   8301 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8302 
   8303 	ins	v27.d[0], v8.d[1]					//GHASH final-4 block - mid
   8304 	ldr	q9, [x0], #16				//AES final-3 block - load ciphertext
   8305 
   8306 	movi	v16.8b, #0						//suppress further partial tag feed in
   8307 
   8308 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final-4 block - low
   8309 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final-4 block - high
   8310 
   8311 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-4 block - mid
   8312 
   8313 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-4 block - high
   8314 
   8315 	pmull	v27.1q, v27.1d, v21.1d				//GHASH final-4 block - mid
   8316 
   8317 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-4 block - low
   8318 	st1	{ v12.16b}, [x2], #16			 	//AES final-4 block - store result
   8319 
   8320 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-4 block - mid
   8321 .inst	0xce04752c	//eor3 v12.16b, v9.16b, v4.16b, v29.16b				//AES final-3 block - result
   8322 .L256_dec_blocks_more_than_3:	//blocks	left >  3
        // On entry v9 holds the final-3 ciphertext block and v12 its result.
        // Reloads v25 from [x3, #112] and v24 from [x3, #96], replacing the values these
        // registers held for the earlier blocks. Then stores v12, loads the next
        // ciphertext block, forms its result from keystream v5, and XORs this block's
        // GHASH products into v17 (high), v18 (mid), v19 (low).
        // Multiplier: v25 for high/low, upper half of v24 for mid (via pmull2).
   8323 
   8324 	ldr	q25, [x3, #112]				//load h4l | h4h
   8325 	ext	v25.16b, v25.16b, v25.16b, #8				//swap the 64-bit halves of h4
   8326 	rev64	v8.16b, v9.16b						//GHASH final-3 block
   8327 
   8328 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8329 	ldr	q9, [x0], #16				//AES final-2 block - load ciphertext
   8330 	ldr	q24, [x3, #96]				//load h4k | h3k
   8331 
   8332 	ins	v27.d[0], v8.d[1]					//GHASH final-3 block - mid
   8333 	st1	{ v12.16b}, [x2], #16			 	//AES final-3 block - store result
   8334 
   8335 .inst	0xce05752c	//eor3 v12.16b, v9.16b, v5.16b, v29.16b				//AES final-2 block - result
   8336 
   8337 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-3 block - mid
   8338 
   8339 	ins	v27.d[1], v27.d[0]					//GHASH final-3 block - mid
   8340 	pmull	v26.1q, v8.1d, v25.1d				//GHASH final-3 block - low
   8341 	pmull2	v28.1q, v8.2d, v25.2d				//GHASH final-3 block - high
   8342 
   8343 	movi	v16.8b, #0						//suppress further partial tag feed in
   8344 	pmull2	v27.1q, v27.2d, v24.2d				//GHASH final-3 block - mid
   8345 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-3 block - low
   8346 
   8347 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-3 block - high
   8348 
   8349 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-3 block - mid
   8350 .L256_dec_blocks_more_than_2:	//blocks	left >  2
   8351 
   8352 	rev64	v8.16b, v9.16b						//GHASH final-2 block
   8353 
   8354 	ldr	q23, [x3, #80]				//load h3l | h3h
   8355 	ext	v23.16b, v23.16b, v23.16b, #8
   8356 	ldr	q9, [x0], #16				//AES final-1 block - load ciphertext
   8357 
   8358 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8359 
   8360 	ins	v27.d[0], v8.d[1]					//GHASH final-2 block - mid
   8361 
   8362 	pmull	v26.1q, v8.1d, v23.1d				//GHASH final-2 block - low
   8363 	st1	{ v12.16b}, [x2], #16			  	//AES final-2 block - store result
   8364 .inst	0xce06752c	//eor3 v12.16b, v9.16b, v6.16b, v29.16b				//AES final-1 block - result
   8365 
   8366 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-2 block - mid
   8367 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-2 block - low
   8368 	movi	v16.8b, #0						//suppress further partial tag feed in
   8369 
   8370 	pmull	v27.1q, v27.1d, v24.1d				//GHASH final-2 block - mid
   8371 	pmull2	v28.1q, v8.2d, v23.2d				//GHASH final-2 block - high
   8372 
   8373 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-2 block - mid
   8374 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-2 block - high
   8375 .L256_dec_blocks_more_than_1:	//blocks	left >  1
   8376 
   8377 	rev64	v8.16b, v9.16b						//GHASH final-1 block
   8378 
   8379 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8380 
   8381 	ins	v27.d[0], v8.d[1]					//GHASH final-1 block - mid
   8382 	ldr	q22, [x3, #64]				//load h2l | h2h
   8383 	ext	v22.16b, v22.16b, v22.16b, #8
   8384 
   8385 	eor	v27.8b, v27.8b, v8.8b				//GHASH final-1 block - mid
   8386 	ldr	q9, [x0], #16				//AES final block - load ciphertext
   8387 	st1	{ v12.16b}, [x2], #16			 	//AES final-1 block - store result
   8388 
   8389 	ldr	q21, [x3, #48]				//load h2k | h1k
   8390 	pmull	v26.1q, v8.1d, v22.1d				//GHASH final-1 block - low
   8391 
   8392 	ins	v27.d[1], v27.d[0]					//GHASH final-1 block - mid
   8393 
   8394 	eor	v19.16b, v19.16b, v26.16b					//GHASH final-1 block - low
   8395 
   8396 .inst	0xce07752c	//eor3 v12.16b, v9.16b, v7.16b, v29.16b				//AES final block - result
   8397 	pmull2	v28.1q, v8.2d, v22.2d				//GHASH final-1 block - high
   8398 
   8399 	pmull2	v27.1q, v27.2d, v21.2d				//GHASH final-1 block - mid
   8400 
   8401 	movi	v16.8b, #0						//suppress further partial tag feed in
   8402 	eor	v17.16b, v17.16b, v28.16b					//GHASH final-1 block - high
   8403 
   8404 	eor	v18.16b, v18.16b, v27.16b				//GHASH final-1 block - mid
   8405 .L256_dec_blocks_less_than_1:	//blocks	left <= 1
   8406 
   8407 	ld1	{ v26.16b}, [x2]					//load existing bytes where the possibly partial last block is to be stored
   8408 	mvn	x6, xzr						//temp0_x = 0xffffffffffffffff
   8409 	and	x1, x1, #127				//bit_length %= 128
   8410 
   8411 	sub	x1, x1, #128				//bit_length -= 128
   8412 	rev32	v30.16b, v30.16b
   8413 	str	q30, [x16]					//store the updated counter
   8414 
   8415 	neg	x1, x1				//bit_length = 128 - #bits in input (in range [1,128])
   8416 
   8417 	and	x1, x1, #127			 	//bit_length %= 128
   8418 
   8419 	lsr	x6, x6, x1				//temp0_x is mask for top 64b of last block
   8420 	cmp	x1, #64
   8421 	mvn	x7, xzr						//temp1_x = 0xffffffffffffffff
   8422 
   8423 	csel	x14, x6, xzr, lt
   8424 	csel	x13, x7, x6, lt
   8425 
   8426 	mov	v0.d[0], x13					//ctr0b is mask for last block
   8427 	mov	v0.d[1], x14
   8428 
   8429 	and	v9.16b, v9.16b, v0.16b					//possibly partial last block has zeroes in highest bits
   8430 	ldr	q20, [x3, #32]				//load h1l | h1h
   8431 	ext	v20.16b, v20.16b, v20.16b, #8
   8432 	bif	v12.16b, v26.16b, v0.16b					//insert existing bytes in top end of result before storing
   8433 
   8434 	rev64	v8.16b, v9.16b						//GHASH final block
   8435 
   8436 	eor	v8.16b, v8.16b, v16.16b					//feed in partial tag
   8437 
   8438 	ins	v16.d[0], v8.d[1]					//GHASH final block - mid
   8439 	pmull2	v28.1q, v8.2d, v20.2d				//GHASH final block - high
   8440 
   8441 	eor	v16.8b, v16.8b, v8.8b				//GHASH final block - mid
   8442 
   8443 	pmull	v26.1q, v8.1d, v20.1d				//GHASH final block - low
   8444 	eor	v17.16b, v17.16b, v28.16b					//GHASH final block - high
   8445 
   8446 	pmull	v16.1q, v16.1d, v21.1d				//GHASH final block - mid
   8447 
   8448 	eor	v18.16b, v18.16b, v16.16b				//GHASH final block - mid
   8449 	ldr	d16, [x10]			//MODULO - load modulo constant
   8450 	eor	v19.16b, v19.16b, v26.16b					//GHASH final block - low
   8451 
   8452 	pmull	v21.1q, v17.1d, v16.1d		 	//MODULO - top 64b align with mid
   8453 	eor	v14.16b, v17.16b, v19.16b				//MODULO - karatsuba tidy up
   8454 
   8455 	ext	v17.16b, v17.16b, v17.16b, #8				//MODULO - other top alignment
   8456 	st1	{ v12.16b}, [x2]				//store all 16B
   8457 
   8458 	eor	v18.16b, v18.16b, v14.16b				//MODULO - karatsuba tidy up
   8459 
   8460 	eor	v21.16b, v17.16b, v21.16b				//MODULO - fold into mid
   8461 	eor	v18.16b, v18.16b, v21.16b				//MODULO - fold into mid
   8462 
   8463 	pmull	v17.1q, v18.1d, v16.1d			//MODULO - mid 64b align with low
   8464 
   8465 	ext	v18.16b, v18.16b, v18.16b, #8				//MODULO - other mid alignment
   8466 	eor	v19.16b, v19.16b, v17.16b				//MODULO - fold into low
   8467 
   8468 	eor	v19.16b, v19.16b, v18.16b				//MODULO - fold into low
   8469 	ext	v19.16b, v19.16b, v19.16b, #8
   8470 	rev64	v19.16b, v19.16b
   8471 	st1	{ v19.16b }, [x3]
   8472 	mov	x0, x9
   8473 
   8474 	ldp	d10, d11, [sp, #16]
   8475 	ldp	d12, d13, [sp, #32]
   8476 	ldp	d14, d15, [sp, #48]
   8477 	ldp	d8, d9, [sp], #80
   8478 	ret
   8479 
   8480 .L256_dec_ret:
   8481 	mov	w0, #0x0
   8482 	ret
   8483 .size	unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
   // Module identification string, emitted as raw bytes and NUL-terminated.
   // Decoded as ASCII the 71 bytes below read:
   //   "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian@arm.com>" followed by 0.
   // No label is attached to it and nothing in the visible code references it.
   8484 .byte	65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
   // Re-align the location counter after the odd-length string above.
   // NOTE(review): with GNU as on AArch64 the operand is presumably a power-of-two
   // exponent (2 -> 4-byte boundary) -- confirm against the assembler in use.
   8485 .align	2
   // Same directive repeated; once the first .align has taken effect this one
   // adds no further padding.
   8486 .align	2
   8487 #endif
   8488