Home | History | Annotate | Line # | Download | only in aarch64
      1  1.1  christos #include "arm_arch.h"
      2  1.1  christos 
      3  1.1  christos #if __ARM_MAX_ARCH__>=8
      4  1.1  christos .arch	armv8-a+crypto	//enable the crypto extension: the kernel below uses aese/aesmc/pmull/pmull2
      5  1.1  christos .text	//assemble what follows into the code section
      6  1.1  christos .globl	aes_gcm_enc_128_kernel	//make the kernel symbol visible to other objects at link time
      7  1.1  christos .type	aes_gcm_enc_128_kernel,%function	//mark the symbol as a function in the object's symbol table
      8  1.1  christos .align	4	//align the entry point; on AArch64 gas the operand is a power of two (2^4 = 16 bytes)
      9  1.1  christos aes_gcm_enc_128_kernel:
     10  1.2  christos 	AARCH64_VALID_CALL_TARGET
     11  1.1  christos 	cbz	x1, .L128_enc_ret
     12  1.1  christos 	stp	x19, x20, [sp, #-112]!
     13  1.1  christos 	mov	x16, x4
     14  1.1  christos 	mov	x8, x5
     15  1.1  christos 	stp	x21, x22, [sp, #16]
     16  1.1  christos 	stp	x23, x24, [sp, #32]
     17  1.1  christos 	stp	d8, d9, [sp, #48]
     18  1.1  christos 	stp	d10, d11, [sp, #64]
     19  1.1  christos 	stp	d12, d13, [sp, #80]
     20  1.1  christos 	stp	d14, d15, [sp, #96]
     21  1.1  christos 
     22  1.1  christos 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
     23  1.1  christos #ifdef __AARCH64EB__
     24  1.1  christos 	rev	x10, x10
     25  1.1  christos 	rev	x11, x11
     26  1.1  christos #endif
     27  1.1  christos 	ldp	x13, x14, [x8, #160]                     //load rk10
     28  1.1  christos #ifdef __AARCH64EB__
     29  1.1  christos 	ror	x13, x13, #32
     30  1.1  christos 	ror	x14, x14, #32
     31  1.1  christos #endif
     32  1.1  christos 	ld1	{v11.16b}, [x3]
     33  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
     34  1.1  christos 	rev64	v11.16b, v11.16b
     35  1.1  christos 	lsr	x5, x1, #3              //byte_len
     36  1.1  christos 	mov	x15, x5
     37  1.1  christos 
     38  1.1  christos 	ld1	{v18.4s}, [x8], #16								  //load rk0
     39  1.1  christos 	add	x4, x0, x1, lsr #3   //end_input_ptr
     40  1.1  christos 	sub	x5, x5, #1      //byte_len - 1
     41  1.1  christos 
     42  1.1  christos 	lsr	x12, x11, #32
     43  1.1  christos 	ldr	q15, [x3, #112]                        //load h4l | h4h
     44  1.1  christos #ifndef __AARCH64EB__
     45  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
     46  1.1  christos #endif
     47  1.1  christos 	fmov	d1, x10                               //CTR block 1
     48  1.1  christos 	rev	w12, w12                                //rev_ctr32
     49  1.1  christos 
     50  1.1  christos 	add	w12, w12, #1                            //increment rev_ctr32
     51  1.1  christos 	orr	w11, w11, w11
     52  1.1  christos 	ld1	{v19.4s}, [x8], #16								  //load rk1
     53  1.1  christos 
     54  1.1  christos 	rev	w9, w12                                 //CTR block 1
     55  1.1  christos 	add	w12, w12, #1                            //CTR block 1
     56  1.1  christos 	fmov	d3, x10                               //CTR block 3
     57  1.1  christos 
     58  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 1
     59  1.1  christos 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
     60  1.1  christos 
     61  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 1
     62  1.1  christos 	rev	w9, w12                                 //CTR block 2
     63  1.1  christos 
     64  1.1  christos 	fmov	d2, x10                               //CTR block 2
     65  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 2
     66  1.1  christos 	add	w12, w12, #1                            //CTR block 2
     67  1.1  christos 
     68  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 2
     69  1.1  christos 	rev	w9, w12                                 //CTR block 3
     70  1.1  christos 
     71  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 3
     72  1.1  christos 	ld1	{v20.4s}, [x8], #16								  //load rk2
     73  1.1  christos 
     74  1.1  christos 	add	w12, w12, #1                            //CTR block 3
     75  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 3
     76  1.1  christos 
     77  1.1  christos 	ldr	q14, [x3, #80]                         //load h3l | h3h
     78  1.1  christos #ifndef __AARCH64EB__
     79  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
     80  1.1  christos #endif
     81  1.1  christos 	aese	v1.16b, v18.16b
     82  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
     83  1.1  christos 	ld1	{v21.4s}, [x8], #16								  //load rk3
     84  1.1  christos 
     85  1.1  christos 	aese	v2.16b, v18.16b
     86  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
     87  1.1  christos 	ldr	q12, [x3, #32]                         //load h1l | h1h
     88  1.1  christos #ifndef __AARCH64EB__
     89  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
     90  1.1  christos #endif
     91  1.1  christos 
     92  1.1  christos 	aese	v0.16b, v18.16b
     93  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
     94  1.1  christos 	ld1	{v22.4s}, [x8], #16								  //load rk4
     95  1.1  christos 
     96  1.1  christos 	aese	v3.16b, v18.16b
     97  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
     98  1.1  christos 	ld1	{v23.4s}, [x8], #16								  //load rk5
     99  1.1  christos 
    100  1.1  christos 	aese	v2.16b, v19.16b
    101  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
    102  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
    103  1.1  christos 
    104  1.1  christos 	aese	v0.16b, v19.16b
    105  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
    106  1.1  christos 	ld1	{v24.4s}, [x8], #16								  //load rk6
    107  1.1  christos 
    108  1.1  christos 	aese	v1.16b, v19.16b
    109  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
    110  1.1  christos 	ld1	{v25.4s}, [x8], #16								  //load rk7
    111  1.1  christos 
    112  1.1  christos 	aese	v3.16b, v19.16b
    113  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
    114  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
    115  1.1  christos 
    116  1.1  christos 	aese	v0.16b, v20.16b
    117  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
    118  1.1  christos 	ld1	{v26.4s}, [x8], #16								  //load rk8
    119  1.1  christos 
    120  1.1  christos 	aese	v1.16b, v20.16b
    121  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
    122  1.1  christos 	ldr	q13, [x3, #64]                         //load h2l | h2h
    123  1.1  christos #ifndef __AARCH64EB__
    124  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
    125  1.1  christos #endif
    126  1.1  christos 
    127  1.1  christos 	aese	v3.16b, v20.16b
    128  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
    129  1.1  christos 
    130  1.1  christos 	aese	v2.16b, v20.16b
    131  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
    132  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
    133  1.1  christos 
    134  1.1  christos 	aese	v0.16b, v21.16b
    135  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
    136  1.1  christos 
    137  1.1  christos 	aese	v1.16b, v21.16b
    138  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
    139  1.1  christos 
    140  1.1  christos 	aese	v2.16b, v21.16b
    141  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
    142  1.1  christos 	ld1	{v27.4s}, [x8], #16								  //load rk9
    143  1.1  christos 
    144  1.1  christos 	aese	v3.16b, v21.16b
    145  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
    146  1.1  christos 
    147  1.1  christos 	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    148  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
    149  1.1  christos 
    150  1.1  christos 	aese	v3.16b, v22.16b
    151  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
    152  1.1  christos 	add	x5, x5, x0
    153  1.1  christos 
    154  1.1  christos 	aese	v2.16b, v22.16b
    155  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
    156  1.1  christos 	cmp	x0, x5                   //check if we have <= 4 blocks
    157  1.1  christos 
    158  1.1  christos 	aese	v0.16b, v22.16b
    159  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
    160  1.1  christos 
    161  1.1  christos 	aese	v3.16b, v23.16b
    162  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
    163  1.1  christos 
    164  1.1  christos 	aese	v2.16b, v23.16b
    165  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
    166  1.1  christos 
    167  1.1  christos 	aese	v0.16b, v23.16b
    168  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
    169  1.1  christos 
    170  1.1  christos 	aese	v3.16b, v24.16b
    171  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
    172  1.1  christos 
    173  1.1  christos 	aese	v1.16b, v22.16b
    174  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
    175  1.1  christos 
    176  1.1  christos 	aese	v2.16b, v24.16b
    177  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
    178  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
    179  1.1  christos 
    180  1.1  christos 	aese	v0.16b, v24.16b
    181  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
    182  1.1  christos 
    183  1.1  christos 	aese	v1.16b, v23.16b
    184  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
    185  1.1  christos 
    186  1.1  christos 	aese	v3.16b, v25.16b
    187  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
    188  1.1  christos 
    189  1.1  christos 	aese	v0.16b, v25.16b
    190  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
    191  1.1  christos 
    192  1.1  christos 	aese	v1.16b, v24.16b
    193  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
    194  1.1  christos 
    195  1.1  christos 	aese	v2.16b, v25.16b
    196  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
    197  1.1  christos 
    198  1.1  christos 	aese	v0.16b, v26.16b
    199  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
    200  1.1  christos 
    201  1.1  christos 	aese	v1.16b, v25.16b
    202  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
    203  1.1  christos 
    204  1.1  christos 	aese	v2.16b, v26.16b
    205  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
    206  1.1  christos 
    207  1.1  christos 	aese	v3.16b, v26.16b
    208  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
    209  1.1  christos 
    210  1.1  christos 	aese	v1.16b, v26.16b
    211  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
    212  1.1  christos 
    213  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
    214  1.1  christos 
    215  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
    216  1.1  christos 
    217  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
    218  1.1  christos 
    219  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
    220  1.1  christos 
    221  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
    222  1.1  christos 	b.ge	.L128_enc_tail                                    //handle tail
    223  1.1  christos 
    224  1.1  christos 	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
    225  1.1  christos #ifdef __AARCH64EB__
    226  1.1  christos 	rev	x6, x6
    227  1.1  christos 	rev	x7, x7
    228  1.1  christos #endif
    229  1.1  christos 	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
    230  1.1  christos #ifdef __AARCH64EB__
    231  1.1  christos 	rev	x21, x21
    232  1.1  christos 	rev	x22, x22
    233  1.1  christos #endif
    234  1.1  christos 	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
    235  1.1  christos #ifdef __AARCH64EB__
    236  1.1  christos 	rev	x19, x19
    237  1.1  christos 	rev	x20, x20
    238  1.1  christos #endif
    239  1.1  christos 	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
    240  1.1  christos #ifdef __AARCH64EB__
    241  1.1  christos 	rev	x23, x23
    242  1.1  christos 	rev	x24, x24
    243  1.1  christos #endif
    244  1.1  christos 	eor	x6, x6, x13                     //AES block 0 - round 10 low
    245  1.1  christos 	eor	x7, x7, x14                     //AES block 0 - round 10 high
    246  1.1  christos 
    247  1.1  christos 	eor	x21, x21, x13                     //AES block 2 - round 10 low
    248  1.1  christos 	fmov	d4, x6                               //AES block 0 - mov low
    249  1.1  christos 
    250  1.1  christos 	eor	x19, x19, x13                     //AES block 1 - round 10 low
    251  1.1  christos 	eor	x22, x22, x14                     //AES block 2 - round 10 high
    252  1.1  christos 	fmov	v4.d[1], x7                           //AES block 0 - mov high
    253  1.1  christos 
    254  1.1  christos 	fmov	d5, x19                               //AES block 1 - mov low
    255  1.1  christos 	eor	x20, x20, x14                     //AES block 1 - round 10 high
    256  1.1  christos 
    257  1.1  christos 	eor	x23, x23, x13                     //AES block 3 - round 10 low
    258  1.1  christos 	fmov	v5.d[1], x20                           //AES block 1 - mov high
    259  1.1  christos 
    260  1.1  christos 	fmov	d6, x21                               //AES block 2 - mov low
    261  1.1  christos 	eor	x24, x24, x14                     //AES block 3 - round 10 high
    262  1.1  christos 	rev	w9, w12                                 //CTR block 4
    263  1.1  christos 
    264  1.1  christos 	fmov	v6.d[1], x22                           //AES block 2 - mov high
    265  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4
    266  1.1  christos 
    267  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
    268  1.1  christos 	fmov	d0, x10                               //CTR block 4
    269  1.1  christos 	add	w12, w12, #1                            //CTR block 4
    270  1.1  christos 
    271  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4
    272  1.1  christos 	rev	w9, w12                                 //CTR block 5
    273  1.1  christos 
    274  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
    275  1.1  christos 	fmov	d1, x10                               //CTR block 5
    276  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 5
    277  1.1  christos 
    278  1.1  christos 	add	w12, w12, #1                            //CTR block 5
    279  1.1  christos 	add	x0, x0, #64                       //AES input_ptr update
    280  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 5
    281  1.1  christos 
    282  1.1  christos 	fmov	d7, x23                               //AES block 3 - mov low
    283  1.1  christos 	rev	w9, w12                                 //CTR block 6
    284  1.1  christos 	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
    285  1.1  christos 
    286  1.1  christos 	fmov	v7.d[1], x24                           //AES block 3 - mov high
    287  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 6
    288  1.1  christos 
    289  1.1  christos 	add	w12, w12, #1                            //CTR block 6
    290  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
    291  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
    292  1.1  christos 
    293  1.1  christos 	fmov	d2, x10                               //CTR block 6
    294  1.1  christos 	cmp	x0, x5                   //check if we have <= 8 blocks
    295  1.1  christos 
    296  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 6
    297  1.1  christos 	rev	w9, w12                                 //CTR block 7
    298  1.1  christos 	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
    299  1.1  christos 
    300  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 7
    301  1.1  christos 
    302  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
    303  1.1  christos 	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
    304  1.1  christos 	b.ge	.L128_enc_prepretail                              //do prepretail
    305  1.1  christos 
    306  1.1  christos .L128_enc_main_loop:	//main	loop start
    307  1.1  christos 	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
    308  1.1  christos #ifdef __AARCH64EB__
    309  1.1  christos 	rev	x23, x23
    310  1.1  christos 	rev	x24, x24
    311  1.1  christos #endif
    312  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
    313  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
    314  1.1  christos 
    315  1.1  christos 	aese	v2.16b, v18.16b
    316  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
    317  1.1  christos 	fmov	d3, x10                               //CTR block 4k+3
    318  1.1  christos 
    319  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
    320  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
    321  1.1  christos 
    322  1.1  christos 	aese	v1.16b, v18.16b
    323  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
    324  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+3
    325  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+3
    326  1.1  christos 
    327  1.1  christos 	aese	v0.16b, v18.16b
    328  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
    329  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
    330  1.1  christos 
    331  1.1  christos 	aese	v2.16b, v19.16b
    332  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
    333  1.1  christos 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
    334  1.1  christos 
    335  1.1  christos 	aese	v1.16b, v19.16b
    336  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
    337  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
    338  1.1  christos 
    339  1.1  christos 	aese	v3.16b, v18.16b
    340  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
    341  1.1  christos 	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
    342  1.1  christos 
    343  1.1  christos 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
    344  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
    345  1.1  christos 	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
    346  1.1  christos #ifdef __AARCH64EB__
    347  1.1  christos 	rev	x6, x6
    348  1.1  christos 	rev	x7, x7
    349  1.1  christos #endif
    350  1.1  christos 	aese	v0.16b, v19.16b
    351  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
    352  1.1  christos 	rev	w9, w12                                 //CTR block 4k+8
    353  1.1  christos 
    354  1.1  christos 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
    355  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
    356  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
    357  1.1  christos 
    358  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
    359  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+8
    360  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
    361  1.1  christos 
    362  1.1  christos 	aese	v0.16b, v20.16b
    363  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
    364  1.1  christos 
    365  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
    366  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
    367  1.1  christos 
    368  1.1  christos 	aese	v1.16b, v20.16b
    369  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
    370  1.1  christos 
    371  1.1  christos 	aese	v0.16b, v21.16b
    372  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
    373  1.1  christos 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
    374  1.1  christos 
    375  1.1  christos 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
    376  1.1  christos 
    377  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
    378  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    379  1.1  christos 
    380  1.1  christos 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
    381  1.1  christos 
    382  1.1  christos 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
    383  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
    384  1.1  christos 
    385  1.1  christos 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
    386  1.1  christos 	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
    387  1.1  christos 
    388  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
    389  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
    390  1.1  christos 
    391  1.1  christos 	aese	v3.16b, v19.16b
    392  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
    393  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
    394  1.1  christos 
    395  1.1  christos 	aese	v2.16b, v20.16b
    396  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
    397  1.1  christos 	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
    398  1.1  christos 
    399  1.1  christos 	aese	v1.16b, v21.16b
    400  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
    401  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
    402  1.1  christos 
    403  1.1  christos 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
    404  1.1  christos 
    405  1.1  christos 	aese	v2.16b, v21.16b
    406  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
    407  1.1  christos 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
    408  1.1  christos 
    409  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
    410  1.1  christos 
    411  1.1  christos 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
    412  1.1  christos 	movi	v8.8b, #0xc2
    413  1.1  christos 
    414  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
    415  1.1  christos 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
    416  1.1  christos 
    417  1.1  christos 	aese	v1.16b, v22.16b
    418  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
    419  1.1  christos 
    420  1.1  christos 	aese	v3.16b, v20.16b
    421  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
    422  1.1  christos 	shl	d8, d8, #56               //mod_constant
    423  1.1  christos 
    424  1.1  christos 	aese	v0.16b, v22.16b
    425  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
    426  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
    427  1.1  christos 
    428  1.1  christos 	aese	v1.16b, v23.16b
    429  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
    430  1.1  christos 	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
    431  1.1  christos #ifdef __AARCH64EB__
    432  1.1  christos 	rev	x19, x19
    433  1.1  christos 	rev	x20, x20
    434  1.1  christos #endif
    435  1.1  christos 	aese	v3.16b, v21.16b
    436  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
    437  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
    438  1.1  christos 
    439  1.1  christos 	aese	v0.16b, v23.16b
    440  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
    441  1.1  christos 	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
    442  1.1  christos #ifdef __AARCH64EB__
    443  1.1  christos 	rev	x21, x21
    444  1.1  christos 	rev	x22, x22
    445  1.1  christos #endif
    446  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
    447  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
    448  1.1  christos 
    449  1.1  christos 	aese	v2.16b, v22.16b
    450  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
    451  1.1  christos 	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
    452  1.1  christos 
    453  1.1  christos 	aese	v3.16b, v22.16b
    454  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
    455  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
    456  1.1  christos 
    457  1.1  christos 	aese	v1.16b, v24.16b
    458  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
    459  1.1  christos 	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
    460  1.1  christos 
    461  1.1  christos 	aese	v2.16b, v23.16b
    462  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
    463  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
    464  1.1  christos 
    465  1.1  christos 	fmov	d4, x6                               //AES block 4k+4 - mov low
    466  1.1  christos 	aese	v0.16b, v24.16b
    467  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
    468  1.1  christos 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
    469  1.1  christos 
    470  1.1  christos 	add	x0, x0, #64                       //AES input_ptr update
    471  1.1  christos 	fmov	d7, x23                               //AES block 4k+3 - mov low
    472  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
    473  1.1  christos 
    474  1.1  christos 	aese	v3.16b, v23.16b
    475  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
    476  1.1  christos 	fmov	d5, x19                               //AES block 4k+5 - mov low
    477  1.1  christos 
    478  1.1  christos 	aese	v0.16b, v25.16b
    479  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
    480  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
    481  1.1  christos 
    482  1.1  christos 	aese	v2.16b, v24.16b
    483  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
    484  1.1  christos 	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
    485  1.1  christos 
    486  1.1  christos 	aese	v1.16b, v25.16b
    487  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
    488  1.1  christos 	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
    489  1.1  christos 
    490  1.1  christos 	aese	v0.16b, v26.16b
    491  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
    492  1.1  christos 	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
    493  1.1  christos 
    494  1.1  christos 	aese	v3.16b, v24.16b
    495  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
    496  1.1  christos 	cmp	x0, x5                   //.LOOP CONTROL
    497  1.1  christos 
    498  1.1  christos 	aese	v1.16b, v26.16b
    499  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
    500  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
    501  1.1  christos 
    502  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
    503  1.1  christos 	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
    504  1.1  christos 	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
    505  1.1  christos 
    506  1.1  christos 	aese	v3.16b, v25.16b
    507  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
    508  1.1  christos 	fmov	d6, x21                               //AES block 4k+6 - mov low
    509  1.1  christos 
    510  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
    511  1.1  christos 	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
    512  1.1  christos 
    513  1.1  christos 	aese	v2.16b, v25.16b
    514  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
    515  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
    516  1.1  christos 
    517  1.1  christos 	fmov	d0, x10                               //CTR block 4k+8
    518  1.1  christos 	aese	v3.16b, v26.16b
    519  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
    520  1.1  christos 
    521  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4k+8
    522  1.1  christos 	rev	w9, w12                                 //CTR block 4k+9
    523  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
    524  1.1  christos 
    525  1.1  christos 	aese	v2.16b, v26.16b
    526  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
    527  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
    528  1.1  christos 
    529  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+9
    530  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
    531  1.1  christos 	fmov	d1, x10                               //CTR block 4k+9
    532  1.1  christos 
    533  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
    534  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 4k+9
    535  1.1  christos 	rev	w9, w12                                 //CTR block 4k+10
    536  1.1  christos 
    537  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
    538  1.1  christos 	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
    539  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
    540  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
    541  1.1  christos 
    542  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
    543  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+10
    544  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
    545  1.1  christos 	fmov	d2, x10                               //CTR block 4k+10
    546  1.1  christos 
    547  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
    548  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
    549  1.1  christos 
    550  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+10
    551  1.1  christos 	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
    552  1.1  christos 	rev	w9, w12                                 //CTR block 4k+11
    553  1.1  christos 
    554  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
    555  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
    556  1.1  christos 
    557  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
    558  1.1  christos 	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
    559  1.1  christos 	b.lt	.L128_enc_main_loop
    560  1.1  christos 
     561  1.1  christos .L128_enc_prepretail:	//PREPRETAIL
                        	// Folds the four byte-reversed blocks in v4-v7 into the GHASH accumulators
                        	// (v9 = high, v10 = mid, v11 = low) and reduces them into v11, interleaved
                        	// with AES rounds 0-9 on the counter blocks in v0-v3.
                        	// v3 is assembled here from x10 (low half) and x9 (high half), and w12 is
                        	// advanced for it. Execution falls through into the tail with v0-v3 still
                        	// lacking the last round-key XOR; the tail applies it through x13/x14.
                        	// v8 is first a GHASH temporary and then the reduction constant.
                        	// NOTE(review): v12-v15 look like the hash key powers and v16/v17 the
                        	// matching Karatsuba mid keys; their loads are outside this section - confirm.
     562  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
     563  1.1  christos 	fmov	d3, x10                               //CTR block 4k+3
     564  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
     565  1.1  christos 
     566  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
     567  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+3
     568  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+3
     569  1.1  christos 
     570  1.1  christos 	aese	v1.16b, v18.16b
     571  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
     572  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
     573  1.1  christos 
     574  1.1  christos 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
     575  1.1  christos 
     576  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
     577  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
     578  1.1  christos 
     579  1.1  christos 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
     580  1.1  christos 
     581  1.1  christos 	aese	v3.16b, v18.16b
     582  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
     583  1.1  christos 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
     584  1.1  christos 
     585  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
     586  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
     587  1.1  christos 
     588  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
     589  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
     590  1.1  christos 
     591  1.1  christos 	aese	v1.16b, v19.16b
     592  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
     593  1.1  christos 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
     594  1.1  christos 
     595  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
     596  1.1  christos 
     597  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
     598  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
     599  1.1  christos 
     600  1.1  christos 	aese	v3.16b, v19.16b
     601  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
     602  1.1  christos 
     603  1.1  christos 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
     604  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
     605  1.1  christos 
     606  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
     607  1.1  christos 
     608  1.1  christos 	aese	v0.16b, v18.16b
     609  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
     610  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
     611  1.1  christos 
     612  1.1  christos 	aese	v2.16b, v18.16b
     613  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
     614  1.1  christos 
     615  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
     616  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
     617  1.1  christos 
     618  1.1  christos 	aese	v0.16b, v19.16b
     619  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
     620  1.1  christos 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
     621  1.1  christos 
     622  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
     623  1.1  christos 
     624  1.1  christos 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
     625  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
     626  1.1  christos 
     627  1.1  christos 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
     628  1.1  christos 
     629  1.1  christos 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
     630  1.1  christos 
     631  1.1  christos 	aese	v2.16b, v19.16b
     632  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
     633  1.1  christos 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
     634  1.1  christos 
     635  1.1  christos 	aese	v0.16b, v20.16b
     636  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
     637  1.1  christos 
     638  1.1  christos 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
     639  1.1  christos 	movi	v8.8b, #0xc2                                     //mod_constant byte pattern, shifted into place by the shl below
     640  1.1  christos 
     641  1.1  christos 	aese	v2.16b, v20.16b
     642  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
     643  1.1  christos 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
     644  1.1  christos 
     645  1.1  christos 	aese	v3.16b, v20.16b
     646  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
     647  1.1  christos 
     648  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
     649  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
     650  1.1  christos 
     651  1.1  christos 	aese	v2.16b, v21.16b
     652  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
     653  1.1  christos 
     654  1.1  christos 	aese	v1.16b, v20.16b
     655  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
     656  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
     657  1.1  christos 
     658  1.1  christos 	aese	v0.16b, v21.16b
     659  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
     660  1.1  christos 
     661  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
     662  1.1  christos 	shl	d8, d8, #56               //mod_constant
     663  1.1  christos 
     664  1.1  christos 	aese	v1.16b, v21.16b
     665  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
     666  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
     667  1.1  christos 
     668  1.1  christos 	aese	v0.16b, v22.16b
     669  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
     670  1.1  christos 
     671  1.1  christos 	pmull	v28.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
     672  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
     673  1.1  christos 
     674  1.1  christos 	aese	v1.16b, v22.16b
     675  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
     676  1.1  christos 
     677  1.1  christos 	aese	v0.16b, v23.16b
     678  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
     679  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
     680  1.1  christos 
     681  1.1  christos 	aese	v3.16b, v21.16b
     682  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
     683  1.1  christos 
     684  1.1  christos 	aese	v2.16b, v22.16b
     685  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
     686  1.1  christos 	eor	v10.16b, v10.16b, v11.16b                         //karatsuba tidy up (mid ^= low)
     687  1.1  christos 
     688  1.1  christos 	aese	v0.16b, v24.16b
     689  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
     690  1.1  christos 
     691  1.1  christos 	aese	v3.16b, v22.16b
     692  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
     693  1.1  christos 
     694  1.1  christos 	aese	v1.16b, v23.16b
     695  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
     696  1.1  christos 
     697  1.1  christos 	aese	v2.16b, v23.16b
     698  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
     699  1.1  christos 	eor	v10.16b, v10.16b, v28.16b                         //MODULO - fold into mid
     700  1.1  christos 
     701  1.1  christos 	aese	v3.16b, v23.16b
     702  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
     703  1.1  christos 
     704  1.1  christos 	aese	v1.16b, v24.16b
     705  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
     706  1.1  christos 
     707  1.1  christos 	aese	v2.16b, v24.16b
     708  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
     709  1.1  christos 
     710  1.1  christos 	aese	v3.16b, v24.16b
     711  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
     712  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
     713  1.1  christos 
     714  1.1  christos 	aese	v0.16b, v25.16b
     715  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
     716  1.1  christos 
     717  1.1  christos 	aese	v2.16b, v25.16b
     718  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
     719  1.1  christos 
     720  1.1  christos 	aese	v3.16b, v25.16b
     721  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
     722  1.1  christos 
     723  1.1  christos 	pmull	v28.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
     724  1.1  christos 
     725  1.1  christos 	aese	v1.16b, v25.16b
     726  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
     727  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
     728  1.1  christos 
     729  1.1  christos 	aese	v3.16b, v26.16b
     730  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
     731  1.1  christos 
     732  1.1  christos 	aese	v0.16b, v26.16b
     733  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
     734  1.1  christos 	eor	v11.16b, v11.16b, v28.16b                         //MODULO - fold into low
     735  1.1  christos 
     736  1.1  christos 	aese	v1.16b, v26.16b
     737  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
     738  1.1  christos 
     739  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
     740  1.1  christos 
     741  1.1  christos 	aese	v2.16b, v26.16b
     742  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
     743  1.1  christos 
     744  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
     745  1.1  christos 
     746  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
     747  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
     748  1.1  christos 
     749  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
     750  1.1  christos .L128_enc_tail:	//TAIL
                        	// Encrypts the first remaining block into v5, then dispatches on how many
                        	// bytes are left (x5): more than 48, 32 or 16 bytes, or at most 16.
                        	// The plaintext is XORed with rk10 (x13/x14) in general registers, so the
                        	// later XOR with the round-9 state in v0 completes the last AES round.
                        	// v8 takes a copy of the running tag (halves swapped) before v11 may be
                        	// cleared; the first block hashed afterwards folds v8 back in.
                        	// On the shorter paths v9-v11 are cleared, keystream blocks are moved so
                        	// the later sections can always use v2 and v3, and w12 is decremented once
                        	// per dispatch level (presumably undoing increments for counter blocks
                        	// that will not be consumed - confirm against the counter setup).
     751  1.1  christos 
     752  1.1  christos 	sub	x5, x4, x0   //x5 = end_input_ptr - input_ptr = number of bytes left to process
     753  1.1  christos 	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
     754  1.1  christos #ifdef __AARCH64EB__
     755  1.1  christos 	rev	x6, x6
     756  1.1  christos 	rev	x7, x7
     757  1.1  christos #endif
     758  1.1  christos 	cmp	x5, #48                         //flags are consumed by the b.gt further down
     759  1.1  christos 
     760  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
     761  1.1  christos 	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
     762  1.1  christos 	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
     763  1.1  christos 
     764  1.1  christos 	fmov	d4, x6                               //AES block 4k+4 - mov low
     765  1.1  christos 
     766  1.1  christos 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
     767  1.1  christos 
     768  1.1  christos 	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
     769  1.1  christos 
     770  1.1  christos 	b.gt	.L128_enc_blocks_more_than_3    //more than 48 bytes left (eor/fmov above leave the flags untouched)
     771  1.1  christos 
     772  1.1  christos 	sub	w12, w12, #1                            //one counter value fewer is needed on this path
     773  1.1  christos 	movi	v11.8b, #0                                       //clear GHASH low accumulator (later sections only XOR into it)
     774  1.1  christos 	mov	v3.16b, v2.16b                                   //keystream for a three-block tail: final block uses v3
     775  1.1  christos 
     776  1.1  christos 	cmp	x5, #32
     777  1.1  christos 	mov	v2.16b, v1.16b                                   //keystream for a three-block tail: final-1 block uses v2
     778  1.1  christos 	movi	v9.8b, #0                                        //clear GHASH high accumulator
     779  1.1  christos 
     780  1.1  christos 	movi	v10.8b, #0                                       //clear GHASH mid accumulator
     781  1.1  christos 	b.gt	.L128_enc_blocks_more_than_2    //more than 32 bytes left
     782  1.1  christos 
     783  1.1  christos 	mov	v3.16b, v1.16b                                   //keystream for a two-block tail: final block uses v3
     784  1.1  christos 	cmp	x5, #16
     785  1.1  christos 
     786  1.1  christos 	sub	w12, w12, #1                            //flags from the cmp survive: sub without the s suffix does not set them
     787  1.1  christos 	b.gt	.L128_enc_blocks_more_than_1    //more than 16 bytes left
     788  1.1  christos 
     789  1.1  christos 	sub	w12, w12, #1
     790  1.1  christos 	b	.L128_enc_blocks_less_than_1    //at most 16 bytes left: v5 already holds the only block
     791  1.1  christos .L128_enc_blocks_more_than_3:	//blocks	left >  3
                        	// Stores the finished block in v5, starts fresh GHASH accumulators from it
                        	// (v11 low, v9 high, v10 mid, using v15 and the high half of v17) after
                        	// folding in the partial tag from v8, and encrypts the next input block
                        	// with keystream v1. v8 is then zeroed so the tag is folded in only once.
                        	// v22 is reused as scratch; the section above used it as an AES round key
                        	// and those rounds are complete by now.
                        	// Falls through into the next section with the new block in v5.
     792  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result
     793  1.1  christos 
     794  1.1  christos 	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
     795  1.1  christos #ifdef __AARCH64EB__
     796  1.1  christos 	rev	x6, x6
     797  1.1  christos 	rev	x7, x7
     798  1.1  christos #endif
     799  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
     800  1.1  christos 
     801  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
     802  1.1  christos 	eor	x7, x7, x14                     //AES final-2 block - round 10 high
     803  1.1  christos 	eor	x6, x6, x13                     //AES final-2 block - round 10 low
     804  1.1  christos 
     805  1.1  christos 	fmov	d5, x6                                 //AES final-2 block - mov low
     806  1.1  christos 
     807  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
     808  1.1  christos 	fmov	v5.d[1], x7                             //AES final-2 block - mov high
     809  1.1  christos 
     810  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
     811  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
     812  1.1  christos 
     813  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
     814  1.1  christos 
     815  1.1  christos 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
     816  1.1  christos 
     817  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
     818  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
     819  1.1  christos 
     820  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
     821  1.1  christos .L128_enc_blocks_more_than_2:	//blocks	left >  2
                        	// Stores the finished block in v5, XORs its GHASH products (taken with v14
                        	// and the low half of v17) into the accumulators v9/v10/v11, and encrypts
                        	// the next input block with keystream v2.
                        	// v8 is non-zero only when this is the first block hashed in the tail; it
                        	// is zeroed afterwards so the partial tag is folded in only once.
                        	// v20-v22 are scratch here (they were AES round keys in the section above).
                        	// Falls through into the next section with the new block in v5.
     822  1.1  christos 
     823  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES final-2 block - store result
     824  1.1  christos 
     825  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
     826  1.1  christos 	ldp	x6, x7, [x0], #16           //AES final-1 block - load input low & high
     827  1.1  christos #ifdef __AARCH64EB__
     828  1.1  christos 	rev	x6, x6
     829  1.1  christos 	rev	x7, x7
     830  1.1  christos #endif
     831  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
     832  1.1  christos 
     833  1.1  christos 	eor	x6, x6, x13                     //AES final-1 block - round 10 low
     834  1.1  christos 
     835  1.1  christos 	fmov	d5, x6                                 //AES final-1 block - mov low
     836  1.1  christos 	eor	x7, x7, x14                     //AES final-1 block - round 10 high
     837  1.1  christos 
     838  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
     839  1.1  christos 	fmov	v5.d[1], x7                             //AES final-1 block - mov high
     840  1.1  christos 
     841  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
     842  1.1  christos 
     843  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
     844  1.1  christos 
     845  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
     846  1.1  christos 
     847  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
     848  1.1  christos 
     849  1.1  christos 	eor	v5.16b, v5.16b, v2.16b                            //AES final-1 block - result
     850  1.1  christos 
     851  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
     852  1.1  christos 
     853  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
     854  1.1  christos 
     855  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
     856  1.1  christos 
     857  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
     858  1.1  christos .L128_enc_blocks_more_than_1:	//blocks	left >  1
                        	// Stores the finished block in v5, XORs its GHASH products (taken with v13
                        	// and the high half of v16) into the accumulators v9/v10/v11, and encrypts
                        	// the last input block with keystream v3.
                        	// v8 is non-zero only when this is the first block hashed in the tail; it
                        	// is zeroed afterwards so the partial tag is folded in only once.
                        	// The result is left in v5 for the final-block section, which masks and
                        	// stores it. Note the load below always reads 16 bytes from the input.
     859  1.1  christos 
     860  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES final-1 block - store result
     861  1.1  christos 
     862  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
     863  1.1  christos 	ldp	x6, x7, [x0], #16           //AES final block - load input low & high
     864  1.1  christos #ifdef __AARCH64EB__
     865  1.1  christos 	rev	x6, x6
     866  1.1  christos 	rev	x7, x7
     867  1.1  christos #endif
     868  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
     869  1.1  christos 
     870  1.1  christos 	eor	x7, x7, x14                     //AES final block - round 10 high
     871  1.1  christos 	eor	x6, x6, x13                     //AES final block - round 10 low
     872  1.1  christos 
     873  1.1  christos 	fmov	d5, x6                                 //AES final block - mov low
     874  1.1  christos 
     875  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
     876  1.1  christos 	fmov	v5.d[1], x7                             //AES final block - mov high
     877  1.1  christos 
     878  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
     879  1.1  christos 
     880  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
     881  1.1  christos 
     882  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
     883  1.1  christos 
     884  1.1  christos 	eor	v5.16b, v5.16b, v3.16b                            //AES final block - result
     885  1.1  christos 
     886  1.1  christos 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid (copy to the high lane for pmull2)
     887  1.1  christos 
     888  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
     889  1.1  christos 
     890  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
     891  1.1  christos 
     892  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
     893  1.1  christos 
     894  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
     895  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
     896  1.1  christos .L128_enc_blocks_less_than_1:	//blocks	left <= 1
                        	// Final, possibly partial, block plus function epilogue.
                        	//  1. Builds a 128-bit keep-mask in v0 from the bit length in x1 and clears
                        	//     the bits of v5 that lie beyond the end of the message.
                        	//  2. Folds the masked block into the GHASH accumulators (v12 / low half of
                        	//     v16) and reduces v9:v10:v11 into v11 with the 0xc2 << 56 constant.
                        	//  3. Merges the bytes already present at [x2] into the masked-off part of
                        	//     v5 and stores a full 16 bytes, so output beyond the message keeps its
                        	//     old value. This reads and writes 16 bytes at x2 even for a partial block.
                        	//  4. Writes the 32-bit counter w12 to [x16, #12] and the tag to [x3],
                        	//     returns the byte length saved in x15, and restores x19-x24 and d8-d15.
                        	// x13/x14 (rk10 until now) and v0/v18 are reused as scratch from here on.
     897  1.1  christos 
     898  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128
     899  1.1  christos 	mvn	x13, xzr                                      //x13 = 0xffffffffffffffff (rk10 low is no longer needed)
     900  1.1  christos 
     901  1.1  christos 	mvn	x14, xzr                                      //x14 = 0xffffffffffffffff (rk10 high is no longer needed)
     902  1.1  christos 	sub	x1, x1, #128                    //bit_length -= 128
     903  1.1  christos 
     904  1.1  christos 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
     905  1.1  christos 
     906  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128: number of bits to drop, 128 wraps to 0 for a full block
     907  1.1  christos 
     908  1.1  christos 	lsr	x14, x14, x1                     //x14 = ones >> (x1 mod 64); the register form of lsr uses the shift amount modulo 64
     909  1.1  christos 	cmp	x1, #64
     910  1.1  christos 
     911  1.1  christos 	csel	x6, x13, x14, lt                 //low mask half: all ones if fewer than 64 bits are dropped, else the shifted mask
     912  1.1  christos 	csel	x7, x14, xzr, lt                 //high mask half: the shifted mask if fewer than 64 bits are dropped, else zero
     913  1.1  christos 
     914  1.1  christos 	fmov	d0, x6                                 //ctr0b is mask for last block
     915  1.1  christos 
     916  1.1  christos 	fmov	v0.d[1], x7
     917  1.1  christos 
     918  1.1  christos 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
     919  1.1  christos 
     920  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final block
     921  1.1  christos 
     922  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
     923  1.1  christos 
     924  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH final block - mid
     925  1.1  christos 
     926  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
     927  1.1  christos 	ld1	{ v18.16b}, [x2]                            //load existing bytes where the possibly partial last block is to be stored
     928  1.1  christos 
     929  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
     930  1.1  christos #ifndef __AARCH64EB__
     931  1.1  christos 	rev	w9, w12                                 //little-endian build: byte-swap the counter before the str below
     932  1.1  christos #else
     933  1.1  christos 	mov	w9, w12                                 //big-endian build: the str below stores the counter as it is
     934  1.1  christos #endif
     935  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
     936  1.1  christos 
     937  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
     938  1.1  christos 
     939  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
     940  1.1  christos 
     941  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
     942  1.1  christos 
     943  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
     944  1.1  christos 	movi	v8.8b, #0xc2                                     //mod_constant byte pattern, shifted into place below
     945  1.1  christos 
     946  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
     947  1.1  christos 
     948  1.1  christos 	shl	d8, d8, #56               //mod_constant
     949  1.1  christos 
     950  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
     951  1.1  christos 
     952  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
     953  1.1  christos 
     954  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
     955  1.1  christos 
     956  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
     957  1.1  christos 
     958  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
     959  1.1  christos 
     960  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
     961  1.1  christos 
     962  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
     963  1.1  christos 
     964  1.1  christos 	bif	v5.16b, v18.16b, v0.16b                              //insert existing bytes in top end of result before storing
     965  1.1  christos 
     966  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
     967  1.1  christos 	st1	{ v5.16b}, [x2]                          //store all 16B
     968  1.1  christos 
     969  1.1  christos 	str	w9, [x16, #12]                          //store the updated counter
     970  1.1  christos 
     971  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
     972  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //undo the ext/rev64 pair applied when the tag was loaded from [x3]
     973  1.1  christos 	rev64	v11.16b, v11.16b
     974  1.1  christos 	mov	x0, x15                                 //return value: byte length saved in x15 at entry
     975  1.1  christos 	st1	{ v11.16b }, [x3]                        //write the updated tag back over the one loaded at entry
     976  1.1  christos 	ldp	x21, x22, [sp, #16]                     //restore the registers saved in the prologue
     977  1.1  christos 	ldp	x23, x24, [sp, #32]
     978  1.1  christos 	ldp	d8, d9, [sp, #48]
     979  1.1  christos 	ldp	d10, d11, [sp, #64]
     980  1.1  christos 	ldp	d12, d13, [sp, #80]
     981  1.1  christos 	ldp	d14, d15, [sp, #96]
     982  1.1  christos 	ldp	x19, x20, [sp], #112                    //restore x19/x20 and release the 112-byte frame
     983  1.1  christos 	ret
    984  1.1  christos 
     985  1.1  christos .L128_enc_ret:
                        	// Early exit for a zero bit length (reached from the cbz on x1 at function
                        	// entry, before the stack frame is pushed, so nothing needs restoring).
     986  1.1  christos 	mov	w0, #0x0                                //return 0: no bytes were processed
     987  1.1  christos 	ret
    988  1.1  christos .size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
    989  1.1  christos .globl	aes_gcm_dec_128_kernel
    990  1.1  christos .type	aes_gcm_dec_128_kernel,%function
    991  1.1  christos .align	4
    992  1.1  christos aes_gcm_dec_128_kernel:
    993  1.2  christos 	AARCH64_VALID_CALL_TARGET
    994  1.1  christos 	cbz	x1, .L128_dec_ret
    995  1.1  christos 	stp	x19, x20, [sp, #-112]!
    996  1.1  christos 	mov	x16, x4
    997  1.1  christos 	mov	x8, x5
    998  1.1  christos 	stp	x21, x22, [sp, #16]
    999  1.1  christos 	stp	x23, x24, [sp, #32]
   1000  1.1  christos 	stp	d8, d9, [sp, #48]
   1001  1.1  christos 	stp	d10, d11, [sp, #64]
   1002  1.1  christos 	stp	d12, d13, [sp, #80]
   1003  1.1  christos 	stp	d14, d15, [sp, #96]
   1004  1.1  christos 
   1005  1.1  christos 	lsr	x5, x1, #3              //byte_len
   1006  1.1  christos 	mov	x15, x5
   1007  1.1  christos 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   1008  1.1  christos #ifdef __AARCH64EB__
   1009  1.1  christos 	rev	x10, x10
   1010  1.1  christos 	rev	x11, x11
   1011  1.1  christos #endif
   1012  1.1  christos 	ldp	x13, x14, [x8, #160]                     //load rk10
   1013  1.1  christos #ifdef __AARCH64EB__
   1014  1.1  christos 	ror	x14, x14, 32
   1015  1.1  christos 	ror	x13, x13, 32
   1016  1.1  christos #endif
   1017  1.1  christos 	sub	x5, x5, #1      //byte_len - 1
   1018  1.1  christos 	ld1	{v18.4s}, [x8], #16                                //load rk0
   1019  1.1  christos 
   1020  1.1  christos 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   1021  1.1  christos 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   1022  1.1  christos 
   1023  1.1  christos 	ldr	q13, [x3, #64]                         //load h2l | h2h
   1024  1.1  christos #ifndef __AARCH64EB__
   1025  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
   1026  1.1  christos #endif
   1027  1.1  christos 	lsr	x12, x11, #32
   1028  1.1  christos 	fmov	d2, x10                               //CTR block 2
   1029  1.1  christos 
   1030  1.1  christos 	ld1	{v19.4s}, [x8], #16                                //load rk1
   1031  1.1  christos 	orr	w11, w11, w11
   1032  1.1  christos 	rev	w12, w12                                //rev_ctr32
   1033  1.1  christos 
   1034  1.1  christos 	fmov	d1, x10                               //CTR block 1
   1035  1.1  christos 	add	w12, w12, #1                            //increment rev_ctr32
   1036  1.1  christos 
   1037  1.1  christos 	aese	v0.16b, v18.16b
   1038  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   1039  1.1  christos 	rev	w9, w12                                 //CTR block 1
   1040  1.1  christos 
   1041  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 1
   1042  1.1  christos 	ld1	{v20.4s}, [x8], #16                                //load rk2
   1043  1.1  christos 	add	w12, w12, #1                            //CTR block 1
   1044  1.1  christos 
   1045  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 1
   1046  1.1  christos 	rev	w9, w12                                 //CTR block 2
   1047  1.1  christos 	add	w12, w12, #1                            //CTR block 2
   1048  1.1  christos 
   1049  1.1  christos 	aese	v0.16b, v19.16b
   1050  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   1051  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 2
   1052  1.1  christos 
   1053  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 2
   1054  1.1  christos 	rev	w9, w12                                 //CTR block 3
   1055  1.1  christos 
   1056  1.1  christos 	fmov	d3, x10                               //CTR block 3
   1057  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 3
   1058  1.1  christos 	add	w12, w12, #1                            //CTR block 3
   1059  1.1  christos 
   1060  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 3
   1061  1.1  christos 	add	x4, x0, x1, lsr #3   //end_input_ptr
   1062  1.1  christos 
   1063  1.1  christos 	aese	v1.16b, v18.16b
   1064  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   1065  1.1  christos 	ld1	{v21.4s}, [x8], #16                                //load rk3
   1066  1.1  christos 
   1067  1.1  christos 	aese	v0.16b, v20.16b
   1068  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   1069  1.1  christos 	ld1	{v22.4s}, [x8], #16                                //load rk4
   1070  1.1  christos 
   1071  1.1  christos 	aese	v2.16b, v18.16b
   1072  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   1073  1.1  christos 	ld1	{v23.4s}, [x8], #16                                //load rk5
   1074  1.1  christos 
   1075  1.1  christos 	aese	v1.16b, v19.16b
   1076  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   1077  1.1  christos 	ld1	{v24.4s}, [x8], #16                                //load rk6
   1078  1.1  christos 
   1079  1.1  christos 	aese	v3.16b, v18.16b
   1080  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   1081  1.1  christos 
   1082  1.1  christos 	aese	v2.16b, v19.16b
   1083  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   1084  1.1  christos 
   1085  1.1  christos 	aese	v1.16b, v20.16b
   1086  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   1087  1.1  christos 
   1088  1.1  christos 	aese	v3.16b, v19.16b
   1089  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   1090  1.1  christos 	ld1	{ v11.16b}, [x3]
   1091  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   1092  1.1  christos 	rev64	v11.16b, v11.16b
   1093  1.1  christos 
   1094  1.1  christos 	aese	v0.16b, v21.16b
   1095  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   1096  1.1  christos 	ld1	{v25.4s}, [x8], #16                                //load rk7
   1097  1.1  christos 
   1098  1.1  christos 	aese	v1.16b, v21.16b
   1099  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   1100  1.1  christos 
   1101  1.1  christos 	aese	v3.16b, v20.16b
   1102  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   1103  1.1  christos 
   1104  1.1  christos 	aese	v2.16b, v20.16b
   1105  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   1106  1.1  christos 	ld1	{v26.4s}, [x8], #16                                //load rk8
   1107  1.1  christos 
   1108  1.1  christos 	aese	v1.16b, v22.16b
   1109  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   1110  1.1  christos 
   1111  1.1  christos 	aese	v3.16b, v21.16b
   1112  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   1113  1.1  christos 
   1114  1.1  christos 	aese	v2.16b, v21.16b
   1115  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   1116  1.1  christos 	ldr	q14, [x3, #80]                         //load h3l | h3h
   1117  1.1  christos #ifndef __AARCH64EB__
   1118  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
   1119  1.1  christos #endif
   1120  1.1  christos 	aese	v0.16b, v22.16b
   1121  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   1122  1.1  christos 	ld1	{v27.4s}, [x8], #16                                //load rk9
   1123  1.1  christos 
   1124  1.1  christos 	aese	v1.16b, v23.16b
   1125  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   1126  1.1  christos 
   1127  1.1  christos 	aese	v2.16b, v22.16b
   1128  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   1129  1.1  christos 
   1130  1.1  christos 	aese	v3.16b, v22.16b
   1131  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   1132  1.1  christos 
   1133  1.1  christos 	aese	v0.16b, v23.16b
   1134  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   1135  1.1  christos 
   1136  1.1  christos 	aese	v2.16b, v23.16b
   1137  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   1138  1.1  christos 	ldr	q12, [x3, #32]                         //load h1l | h1h
   1139  1.1  christos #ifndef __AARCH64EB__
   1140  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
   1141  1.1  christos #endif
   1142  1.1  christos 	aese	v3.16b, v23.16b
   1143  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   1144  1.1  christos 
   1145  1.1  christos 	aese	v0.16b, v24.16b
   1146  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   1147  1.1  christos 
   1148  1.1  christos 	aese	v1.16b, v24.16b
   1149  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   1150  1.1  christos 
   1151  1.1  christos 	aese	v3.16b, v24.16b
   1152  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   1153  1.1  christos 
   1154  1.1  christos 	aese	v2.16b, v24.16b
   1155  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   1156  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   1157  1.1  christos 
   1158  1.1  christos 	ldr	q15, [x3, #112]                        //load h4l | h4h
   1159  1.1  christos #ifndef __AARCH64EB__
   1160  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
   1161  1.1  christos #endif
   1162  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   1163  1.1  christos 	add	x5, x5, x0
   1164  1.1  christos 
   1165  1.1  christos 	aese	v1.16b, v25.16b
   1166  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   1167  1.1  christos 
   1168  1.1  christos 	aese	v2.16b, v25.16b
   1169  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   1170  1.1  christos 
   1171  1.1  christos 	aese	v0.16b, v25.16b
   1172  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   1173  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   1174  1.1  christos 
   1175  1.1  christos 	aese	v3.16b, v25.16b
   1176  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   1177  1.1  christos 
   1178  1.1  christos 	aese	v1.16b, v26.16b
   1179  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   1180  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   1181  1.1  christos 
   1182  1.1  christos 	aese	v2.16b, v26.16b
   1183  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   1184  1.1  christos 
   1185  1.1  christos 	aese	v3.16b, v26.16b
   1186  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   1187  1.1  christos 
   1188  1.1  christos 	aese	v0.16b, v26.16b
   1189  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   1190  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   1191  1.1  christos 
   1192  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
   1193  1.1  christos 
   1194  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
   1195  1.1  christos 
   1196  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
   1197  1.1  christos 	cmp	x0, x5                   //check if we have <= 4 blocks
   1198  1.1  christos 
   1199  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
   1200  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   1201  1.1  christos 	b.ge	.L128_dec_tail                                    //handle tail
   1202  1.1  christos 
   1203  1.1  christos 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0 - load ciphertext; AES block 1 - load ciphertext
   1204  1.1  christos 
   1205  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   1206  1.1  christos 	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
   1207  1.1  christos 
   1208  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   1209  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   1210  1.1  christos 	rev	w9, w12                                 //CTR block 4
   1211  1.1  christos 
   1212  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4
   1213  1.1  christos 	add	w12, w12, #1                            //CTR block 4
   1214  1.1  christos 	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
   1215  1.1  christos 
   1216  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   1217  1.1  christos 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   1218  1.1  christos 
   1219  1.1  christos 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   1220  1.1  christos 
   1221  1.1  christos 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   1222  1.1  christos 	cmp	x0, x5                   //check if we have <= 8 blocks
   1223  1.1  christos 
   1224  1.1  christos 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   1225  1.1  christos 
   1226  1.1  christos 	fmov	d0, x10                               //CTR block 4
   1227  1.1  christos 
   1228  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4
   1229  1.1  christos 	rev	w9, w12                                 //CTR block 5
   1230  1.1  christos 	eor	x19, x19, x13                   //AES block 1 - round 10 low
   1231  1.1  christos #ifdef __AARCH64EB__
   1232  1.1  christos 	rev	x19, x19
   1233  1.1  christos #endif
   1234  1.1  christos 	fmov	d1, x10                               //CTR block 5
   1235  1.1  christos 	add	w12, w12, #1                            //CTR block 5
   1236  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 5
   1237  1.1  christos 
   1238  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 5
   1239  1.1  christos 	rev	w9, w12                                 //CTR block 6
   1240  1.1  christos 	add	w12, w12, #1                            //CTR block 6
   1241  1.1  christos 
   1242  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 6
   1243  1.1  christos 
   1244  1.1  christos 	eor	x20, x20, x14                   //AES block 1 - round 10 high
   1245  1.1  christos #ifdef __AARCH64EB__
   1246  1.1  christos 	rev	x20, x20
   1247  1.1  christos #endif
   1248  1.1  christos 	eor	x6, x6, x13                   //AES block 0 - round 10 low
   1249  1.1  christos #ifdef __AARCH64EB__
   1250  1.1  christos 	rev	x6, x6
   1251  1.1  christos #endif
   1252  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   1253  1.1  christos 
   1254  1.1  christos 	eor	x7, x7, x14                   //AES block 0 - round 10 high
   1255  1.1  christos #ifdef __AARCH64EB__
   1256  1.1  christos 	rev	x7, x7
   1257  1.1  christos #endif
   1258  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   1259  1.1  christos 
   1260  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   1261  1.1  christos 	b.ge	.L128_dec_prepretail                              //do prepretail
   1262  1.1  christos 
   1263  1.1  christos .L128_dec_main_loop:	//main loop start: each pass stores results for blocks 4k+2..4k+5, runs AES rounds 0-9 on the counter blocks in v0-v3, GHASHes the four ciphertext blocks held in v4-v7, and loads the next four from [x0]
   1264  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   1265  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   1266  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   1267  1.1  christos 
   1268  1.1  christos 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   1269  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   1270  1.1  christos 
   1271  1.1  christos 	aese	v1.16b, v18.16b
   1272  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   1273  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   1274  1.1  christos 
   1275  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   1276  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   1277  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   1278  1.1  christos 
   1279  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   1280  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   1281  1.1  christos 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
   1282  1.1  christos 
   1283  1.1  christos 	aese	v1.16b, v19.16b
   1284  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   1285  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   1286  1.1  christos 
   1287  1.1  christos 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   1288  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   1289  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   1290  1.1  christos 
   1291  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   1292  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   1293  1.1  christos 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
   1294  1.1  christos 
   1295  1.1  christos 	aese	v1.16b, v20.16b
   1296  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   1297  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   1298  1.1  christos 
   1299  1.1  christos 	aese	v2.16b, v18.16b
   1300  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   1301  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   1302  1.1  christos 
   1303  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   1304  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
   1305  1.1  christos 
   1306  1.1  christos 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   1307  1.1  christos 
   1308  1.1  christos 	aese	v1.16b, v21.16b
   1309  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   1310  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   1311  1.1  christos 
   1312  1.1  christos 	aese	v3.16b, v18.16b
   1313  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   1314  1.1  christos 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
   1315  1.1  christos 
   1316  1.1  christos 	aese	v0.16b, v18.16b
   1317  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   1318  1.1  christos 
   1319  1.1  christos 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   1320  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   1321  1.1  christos 
   1322  1.1  christos 	aese	v3.16b, v19.16b
   1323  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   1324  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
   1325  1.1  christos #ifdef __AARCH64EB__
   1326  1.1  christos 	rev	x23, x23
   1327  1.1  christos #endif
   1328  1.1  christos 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
   1329  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
   1330  1.1  christos #ifdef __AARCH64EB__
   1331  1.1  christos 	rev	x22, x22
   1332  1.1  christos #endif
   1333  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   1334  1.1  christos 
   1335  1.1  christos 	aese	v0.16b, v19.16b
   1336  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   1337  1.1  christos 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
   1338  1.1  christos 
   1339  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   1340  1.1  christos 
   1341  1.1  christos 	aese	v3.16b, v20.16b
   1342  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   1343  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   1344  1.1  christos 
   1345  1.1  christos 	aese	v0.16b, v20.16b
   1346  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   1347  1.1  christos 
   1348  1.1  christos 	aese	v1.16b, v22.16b
   1349  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   1350  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
   1351  1.1  christos 
   1352  1.1  christos 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   1353  1.1  christos 
   1354  1.1  christos 	aese	v0.16b, v21.16b
   1355  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   1356  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   1357  1.1  christos 
   1358  1.1  christos 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   1359  1.1  christos 
   1360  1.1  christos 	aese	v2.16b, v19.16b
   1361  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   1362  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   1363  1.1  christos 
   1364  1.1  christos 	aese	v0.16b, v22.16b
   1365  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   1366  1.1  christos 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
   1367  1.1  christos 
   1368  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   1369  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
   1370  1.1  christos #ifdef __AARCH64EB__
   1371  1.1  christos 	rev	x24, x24
   1372  1.1  christos #endif
   1373  1.1  christos 	aese	v2.16b, v20.16b
   1374  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   1375  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   1376  1.1  christos 
   1377  1.1  christos 	aese	v1.16b, v23.16b
   1378  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   1379  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
   1380  1.1  christos #ifdef __AARCH64EB__
   1381  1.1  christos 	rev	x21, x21
   1382  1.1  christos #endif
   1383  1.1  christos 	aese	v0.16b, v23.16b
   1384  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   1385  1.1  christos 	movi	v8.8b, #0xc2              //MODULO - 0xc2 in every byte of d8; shifted into the top byte by the shl below
   1386  1.1  christos 
   1387  1.1  christos 	aese	v2.16b, v21.16b
   1388  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   1389  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
   1390  1.1  christos 
   1391  1.1  christos 	aese	v1.16b, v24.16b
   1392  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   1393  1.1  christos 
   1394  1.1  christos 	aese	v0.16b, v24.16b
   1395  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   1396  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   1397  1.1  christos 
   1398  1.1  christos 	aese	v2.16b, v22.16b
   1399  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   1400  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   1401  1.1  christos 
   1402  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   1403  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
   1404  1.1  christos 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext (XORed with v0 below)
   1405  1.1  christos 
   1406  1.1  christos 	aese	v1.16b, v25.16b
   1407  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   1408  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   1409  1.1  christos 
   1410  1.1  christos 	aese	v0.16b, v25.16b
   1411  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   1412  1.1  christos 	shl	d8, d8, #56               //mod_constant - d8 = 0xc2 << 56
   1413  1.1  christos 
   1414  1.1  christos 	aese	v2.16b, v23.16b
   1415  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   1416  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   1417  1.1  christos 
   1418  1.1  christos 	aese	v1.16b, v26.16b
   1419  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   1420  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   1421  1.1  christos 
   1422  1.1  christos 	aese	v0.16b, v26.16b
   1423  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   1424  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1425  1.1  christos 
   1426  1.1  christos 	aese	v3.16b, v21.16b
   1427  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   1428  1.1  christos 	rev	w9, w12                                 //CTR block 4k+8
   1429  1.1  christos 
   1430  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1431  1.1  christos 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext (XORed with v1 below)
   1432  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1433  1.1  christos 
   1434  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
   1435  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   1436  1.1  christos 
   1437  1.1  christos 	aese	v3.16b, v22.16b
   1438  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   1439  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1440  1.1  christos 
   1441  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
   1442  1.1  christos 
   1443  1.1  christos 	aese	v2.16b, v24.16b
   1444  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   1445  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   1446  1.1  christos 
   1447  1.1  christos 	aese	v3.16b, v23.16b
   1448  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   1449  1.1  christos 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext (XORed with v2 below)
   1450  1.1  christos 
   1451  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+8
   1452  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1453  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   1454  1.1  christos 
   1455  1.1  christos 	aese	v2.16b, v25.16b
   1456  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   1457  1.1  christos 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext (XORed with v3 as "block 4k+3" by the next pass or the prepretail)
   1458  1.1  christos 
   1459  1.1  christos 	aese	v3.16b, v24.16b
   1460  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   1461  1.1  christos 
   1462  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   1463  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1464  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   1465  1.1  christos 
   1466  1.1  christos 	aese	v2.16b, v26.16b
   1467  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   1468  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   1469  1.1  christos 
   1470  1.1  christos 	aese	v3.16b, v25.16b
   1471  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   1472  1.1  christos 	fmov	d0, x10                               //CTR block 4k+8
   1473  1.1  christos 
   1474  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1475  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4k+8
   1476  1.1  christos 	rev	w9, w12                                 //CTR block 4k+9
   1477  1.1  christos 
   1478  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
   1479  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   1480  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1481  1.1  christos 
   1482  1.1  christos 	aese	v3.16b, v26.16b
   1483  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   1484  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
   1485  1.1  christos #ifdef __AARCH64EB__
   1486  1.1  christos 	rev	x7, x7
   1487  1.1  christos #endif
   1488  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1489  1.1  christos 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   1490  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
   1491  1.1  christos #ifdef __AARCH64EB__
   1492  1.1  christos 	rev	x6, x6
   1493  1.1  christos #endif
   1494  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   1495  1.1  christos 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   1496  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+9
   1497  1.1  christos 
   1498  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
   1499  1.1  christos 	fmov	d1, x10                               //CTR block 4k+9
   1500  1.1  christos 	cmp	x0, x5                   //LOOP CONTROL - flags are consumed by the b.lt that closes the loop
   1501  1.1  christos 
   1502  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   1503  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1504  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 4k+9
   1505  1.1  christos 
   1506  1.1  christos 	rev	w9, w12                                 //CTR block 4k+10
   1507  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+10
   1508  1.1  christos 
   1509  1.1  christos 	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
   1510  1.1  christos #ifdef __AARCH64EB__
   1511  1.1  christos 	rev	x20, x20
   1512  1.1  christos #endif
   1513  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   1514  1.1  christos 
   1515  1.1  christos 	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
   1516  1.1  christos #ifdef __AARCH64EB__
   1517  1.1  christos 	rev	x19, x19
   1518  1.1  christos #endif
   1519  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   1520  1.1  christos 
   1521  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   1522  1.1  christos 	b.lt	.L128_dec_main_loop                               //repeat while x0 < x5 - NOTE(review): b.lt is a signed condition on a pointer compare; confirm b.lo is not intended
   1523  1.1  christos 
   1524  1.1  christos .L128_dec_prepretail:	//PREPRETAIL
   1525  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   1526  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   1527  1.1  christos 	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
   1528  1.1  christos 
   1529  1.1  christos 	aese	v0.16b, v18.16b
   1530  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   1531  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   1532  1.1  christos 
   1533  1.1  christos 	aese	v1.16b, v18.16b
   1534  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   1535  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   1536  1.1  christos 
   1537  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   1538  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   1539  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   1540  1.1  christos 
   1541  1.1  christos 	aese	v0.16b, v19.16b
   1542  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   1543  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   1544  1.1  christos 
   1545  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   1546  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   1547  1.1  christos 	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
   1548  1.1  christos 
   1549  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   1550  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   1551  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   1552  1.1  christos 
   1553  1.1  christos 	aese	v1.16b, v19.16b
   1554  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   1555  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   1556  1.1  christos 
   1557  1.1  christos 	aese	v0.16b, v20.16b
   1558  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   1559  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   1560  1.1  christos 
   1561  1.1  christos 	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   1562  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   1563  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   1564  1.1  christos 
   1565  1.1  christos 	aese	v2.16b, v18.16b
   1566  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   1567  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   1568  1.1  christos 
   1569  1.1  christos 	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
   1570  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   1571  1.1  christos 
   1572  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   1573  1.1  christos 
   1574  1.1  christos 	aese	v2.16b, v19.16b
   1575  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   1576  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   1577  1.1  christos 
   1578  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   1579  1.1  christos 
   1580  1.1  christos 	aese	v3.16b, v18.16b
   1581  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   1582  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   1583  1.1  christos 
   1584  1.1  christos 	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   1585  1.1  christos 
   1586  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   1587  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
   1588  1.1  christos 
   1589  1.1  christos 	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   1590  1.1  christos 
   1591  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   1592  1.1  christos 	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
   1593  1.1  christos 
   1594  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
   1595  1.1  christos 
   1596  1.1  christos 	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   1597  1.1  christos 
   1598  1.1  christos 	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   1599  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   1600  1.1  christos 
   1601  1.1  christos 	aese	v1.16b, v20.16b
   1602  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   1603  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   1604  1.1  christos 
   1605  1.1  christos 	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   1606  1.1  christos 
   1607  1.1  christos 	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
   1608  1.1  christos 	movi	v8.8b, #0xc2
   1609  1.1  christos 
   1610  1.1  christos 	aese	v3.16b, v19.16b
   1611  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   1612  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   1613  1.1  christos 
   1614  1.1  christos 	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
   1615  1.1  christos 
   1616  1.1  christos 	aese	v2.16b, v20.16b
   1617  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   1618  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
   1619  1.1  christos 
   1620  1.1  christos 	aese	v3.16b, v20.16b
   1621  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   1622  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
   1623  1.1  christos #ifdef __AARCH64EB__
   1624  1.1  christos 	rev	x23, x23
   1625  1.1  christos #endif
   1626  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   1627  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
   1628  1.1  christos #ifdef __AARCH64EB__
   1629  1.1  christos 	rev	x21, x21
   1630  1.1  christos #endif
   1631  1.1  christos 	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
   1632  1.1  christos 
   1633  1.1  christos 	aese	v2.16b, v21.16b
   1634  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   1635  1.1  christos 
   1636  1.1  christos 	aese	v1.16b, v21.16b
   1637  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   1638  1.1  christos 	shl	d8, d8, #56               //mod_constant
   1639  1.1  christos 
   1640  1.1  christos 	aese	v0.16b, v21.16b
   1641  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   1642  1.1  christos 
   1643  1.1  christos 	aese	v2.16b, v22.16b
   1644  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   1645  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   1646  1.1  christos 
   1647  1.1  christos 	aese	v1.16b, v22.16b
   1648  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   1649  1.1  christos 
   1650  1.1  christos 	aese	v3.16b, v21.16b
   1651  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   1652  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1653  1.1  christos 
   1654  1.1  christos 	aese	v2.16b, v23.16b
   1655  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   1656  1.1  christos 
   1657  1.1  christos 	aese	v1.16b, v23.16b
   1658  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   1659  1.1  christos 
   1660  1.1  christos 	aese	v3.16b, v22.16b
   1661  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   1662  1.1  christos 
   1663  1.1  christos 	aese	v0.16b, v22.16b
   1664  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   1665  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1666  1.1  christos 
   1667  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1668  1.1  christos 
   1669  1.1  christos 	aese	v1.16b, v24.16b
   1670  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   1671  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1672  1.1  christos 
   1673  1.1  christos 	aese	v3.16b, v23.16b
   1674  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   1675  1.1  christos 
   1676  1.1  christos 	aese	v0.16b, v23.16b
   1677  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   1678  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1679  1.1  christos 
   1680  1.1  christos 	aese	v1.16b, v25.16b
   1681  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   1682  1.1  christos 
   1683  1.1  christos 	aese	v2.16b, v24.16b
   1684  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   1685  1.1  christos 
   1686  1.1  christos 	aese	v0.16b, v24.16b
   1687  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   1688  1.1  christos 
   1689  1.1  christos 	aese	v1.16b, v26.16b
   1690  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   1691  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1692  1.1  christos 
   1693  1.1  christos 	aese	v3.16b, v24.16b
   1694  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   1695  1.1  christos 
   1696  1.1  christos 	aese	v0.16b, v25.16b
   1697  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   1698  1.1  christos 
   1699  1.1  christos 	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
   1700  1.1  christos 
   1701  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1702  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
   1703  1.1  christos #ifdef __AARCH64EB__
   1704  1.1  christos 	rev	x24, x24
   1705  1.1  christos #endif
   1706  1.1  christos 	aese	v2.16b, v25.16b
   1707  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   1708  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1709  1.1  christos 
   1710  1.1  christos 	aese	v3.16b, v25.16b
   1711  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   1712  1.1  christos 
   1713  1.1  christos 	aese	v0.16b, v26.16b
   1714  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   1715  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1716  1.1  christos 
   1717  1.1  christos 	aese	v2.16b, v26.16b
   1718  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   1719  1.1  christos 
   1720  1.1  christos 	aese	v3.16b, v26.16b
   1721  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   1722  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
   1723  1.1  christos #ifdef __AARCH64EB__
   1724  1.1  christos 	rev	x22, x22
   1725  1.1  christos #endif
   1726  1.1  christos 	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
   1727  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   1728  1.1  christos 
   1729  1.1  christos 	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
   1730  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   1731  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   1732  1.1  christos 
   1733  1.1  christos 	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
   1734  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1735  1.1  christos .L128_dec_tail:	//TAIL - decrypt and GHASH the remaining (at most four) blocks; the last one may be partial
   1736  1.1  christos 	//on fall-through from the loop above v0-v3 hold AES states after round 9; the round 10 key is applied from x13/x14
   1737  1.1  christos 	sub	x5, x4, x0   //x5 = x4 - x0: the register the old comment calls main_end_input_ptr now holds the number of bytes left to process
   1738  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext (x0 advances by 16)
   1739  1.1  christos 
   1740  1.1  christos 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result (still missing the round 10 key)
   1741  1.1  christos 
   1742  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   1743  1.1  christos 
   1744  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   1745  1.1  christos 
   1746  1.1  christos 	cmp	x5, #48                                 //more than 3 blocks left? the flags are consumed by the b.gt further down
   1747  1.1  christos 
   1748  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high (eor/ext/rev do not disturb the flags from cmp)
   1749  1.1  christos #ifdef __AARCH64EB__
   1750  1.1  christos 	rev	x7, x7
   1751  1.1  christos #endif
   1752  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag: v8 = v11 with its 64-bit halves swapped
   1753  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
   1754  1.1  christos #ifdef __AARCH64EB__
   1755  1.1  christos 	rev	x6, x6
   1756  1.1  christos #endif
   1757  1.1  christos 	b.gt	.L128_dec_blocks_more_than_3               //x5 > 48: four blocks remain
   1758  1.1  christos 	//x5 <= 48: w12 is written back as the updated counter in .L128_dec_blocks_less_than_1, so it is decremented once per keystream block that will not be consumed
   1759  1.1  christos 	mov	v3.16b, v2.16b                              //shift the pending keystream down one register: v3 <- v2
   1760  1.1  christos 	sub	w12, w12, #1                                //one keystream block goes unused
   1761  1.1  christos 	movi	v11.8b, #0                                  //clear the GHASH low accumulator (.L128_dec_blocks_more_than_3, skipped here, would have written it)
   1762  1.1  christos 
   1763  1.1  christos 	movi	v9.8b, #0                                   //clear the GHASH high accumulator
   1764  1.1  christos 	mov	v2.16b, v1.16b                              //shift the pending keystream down one register: v2 <- v1
   1765  1.1  christos 
   1766  1.1  christos 	movi	v10.8b, #0                                  //clear the GHASH mid accumulator
   1767  1.1  christos 	cmp	x5, #32                                     //more than 2 blocks left?
   1768  1.1  christos 	b.gt	.L128_dec_blocks_more_than_2
   1769  1.1  christos 
   1770  1.1  christos 	cmp	x5, #16                                     //more than 1 block left? the mov/sub below leave the flags alone
   1771  1.1  christos 
   1772  1.1  christos 	mov	v3.16b, v1.16b                              //two blocks: the last one is decrypted with v3 in .L128_dec_blocks_more_than_1
   1773  1.1  christos 	sub	w12, w12, #1                                //a second keystream block goes unused
   1774  1.1  christos 	b.gt	.L128_dec_blocks_more_than_1
   1775  1.1  christos 
   1776  1.1  christos 	sub	w12, w12, #1                                //single block: a third keystream block goes unused
   1777  1.1  christos 	b	.L128_dec_blocks_less_than_1                 //x6/x7 already hold the decrypted block
   1778  1.1  christos .L128_dec_blocks_more_than_3:	//blocks	left >  3 - GHASH the block just decrypted (final-3), decrypt final-2, then fall through
   1779  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block (bytes reversed within each 64-bit half of the ciphertext)
   1780  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
   1781  1.1  christos 
   1782  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1783  1.1  christos 
   1784  1.1  christos 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid (d10 = upper half of v17, the multiplier for the mid product below)
   1785  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
   1786  1.1  christos 	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result (keystream taken from v1)
   1787  1.1  christos 
   1788  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid (v22 was a round key in the main loop; it is scratch from here on)
   1789  1.1  christos 	mov	x7, v0.d[1]                            //AES final-2 block - mov high
   1790  1.1  christos 
   1791  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low (written, not accumulated: this starts the low sum)
   1792  1.1  christos 	mov	x6, v0.d[0]                            //AES final-2 block - mov low
   1793  1.1  christos 
   1794  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high (starts the high sum)
   1795  1.1  christos 
   1796  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid (upper half ^ lower half of the block)
   1797  1.1  christos 
   1798  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1799  1.1  christos 	eor	x7, x7, x14                   //AES final-2 block - round 10 high
   1800  1.1  christos #ifdef __AARCH64EB__
   1801  1.1  christos 	rev	x7, x7
   1802  1.1  christos #endif
   1803  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid (starts the mid sum)
   1804  1.1  christos 	eor	x6, x6, x13                   //AES final-2 block - round 10 low
   1805  1.1  christos #ifdef __AARCH64EB__
   1806  1.1  christos 	rev	x6, x6
   1807  1.1  christos #endif
   1808  1.1  christos .L128_dec_blocks_more_than_2:	//blocks	left >  2 - GHASH the block just decrypted (final-2), decrypt final-1, then fall through
   1809  1.1  christos 	//v20/v21/v22 served as round keys in the main loop; here they are temporaries for the high/low/mid products
   1810  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
   1811  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
   1812  1.1  christos 
   1813  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero if an earlier tail block already absorbed it)
   1814  1.1  christos 
   1815  1.1  christos 	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result (keystream taken from v2)
   1816  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
   1817  1.1  christos 
   1818  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
   1819  1.1  christos 
   1820  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
   1821  1.1  christos 
   1822  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
   1823  1.1  christos 	mov	x6, v0.d[0]                            //AES final-1 block - mov low
   1824  1.1  christos 
   1825  1.1  christos 	mov	x7, v0.d[1]                            //AES final-1 block - mov high
   1826  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid (upper half ^ lower half of the block)
   1827  1.1  christos 
   1828  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1829  1.1  christos 
   1830  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid (multiplied by the lower half of v17)
   1831  1.1  christos 
   1832  1.1  christos 	eor	x6, x6, x13                   //AES final-1 block - round 10 low
   1833  1.1  christos #ifdef __AARCH64EB__
   1834  1.1  christos 	rev	x6, x6
   1835  1.1  christos #endif
   1836  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low (accumulate)
   1837  1.1  christos 
   1838  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high (accumulate)
   1839  1.1  christos 
   1840  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid (accumulate)
   1841  1.1  christos 	eor	x7, x7, x14                   //AES final-1 block - round 10 high
   1842  1.1  christos #ifdef __AARCH64EB__
   1843  1.1  christos 	rev	x7, x7
   1844  1.1  christos #endif
   1845  1.1  christos .L128_dec_blocks_more_than_1:	//blocks	left >  1 - GHASH the block just decrypted (final-1), decrypt the final block, then fall through
   1846  1.1  christos 
   1847  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
   1848  1.1  christos 
   1849  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext (a full 16-byte load; unused bits are masked off in .L128_dec_blocks_less_than_1)
   1850  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag (v8 is zero if an earlier tail block already absorbed it)
   1851  1.1  christos 
   1852  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
   1853  1.1  christos 
   1854  1.1  christos 	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result (keystream taken from v3)
   1855  1.1  christos 
   1856  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid (upper half ^ lower half of the block)
   1857  1.1  christos 
   1858  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
   1859  1.1  christos 	mov	x6, v0.d[0]                            //AES final block - mov low
   1860  1.1  christos 
   1861  1.1  christos 	mov	x7, v0.d[1]                            //AES final block - mov high
   1862  1.1  christos 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid (copy into the upper lane so pmull2 below can use the upper half of v16)
   1863  1.1  christos 
   1864  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
   1865  1.1  christos 
   1866  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
   1867  1.1  christos 
   1868  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
   1869  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   1870  1.1  christos 
   1871  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low (accumulate)
   1872  1.1  christos 
   1873  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high (accumulate)
   1874  1.1  christos 	eor	x7, x7, x14                   //AES final block - round 10 high
   1875  1.1  christos #ifdef __AARCH64EB__
   1876  1.1  christos 	rev	x7, x7
   1877  1.1  christos #endif
   1878  1.1  christos 	eor	x6, x6, x13                   //AES final block - round 10 low
   1879  1.1  christos #ifdef __AARCH64EB__
   1880  1.1  christos 	rev	x6, x6
   1881  1.1  christos #endif
   1882  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid (accumulate)
   1883  1.1  christos .L128_dec_blocks_less_than_1:	//blocks	left <= 1 - mask and store the last block, finish GHASH, write back counter and tag, return
   1884  1.1  christos 	//x13/x14 are done serving as the round 10 key (x6/x7 are final) and are reused to build the last-block bit mask
   1885  1.1  christos 	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff
   1886  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128
   1887  1.1  christos 
   1888  1.1  christos 	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff
   1889  1.1  christos 	sub	x1, x1, #128                    //bit_length -= 128
   1890  1.1  christos 
   1891  1.1  christos 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
   1892  1.1  christos 
   1893  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128: x1 = number of unused bits in the last block, 0..127 (0 for a full block)
   1894  1.1  christos 
   1895  1.1  christos 	lsr	x14, x14, x1                     //rk10_h is mask for top 64b of last block (a register shift amount is taken modulo 64)
   1896  1.1  christos 	cmp	x1, #64
   1897  1.1  christos 
   1898  1.1  christos 	csel	x10, x14, xzr, lt                           //high mask: the shifted mask if fewer than 64 bits are unused, else zero
   1899  1.1  christos 	csel	x9, x13, x14, lt                            //low mask: all ones if fewer than 64 bits are unused, else the shifted mask
   1900  1.1  christos 
   1901  1.1  christos 	fmov	d0, x9                                   //ctr0b is mask for last block
   1902  1.1  christos 
   1903  1.1  christos 	mov	v0.d[1], x10
   1904  1.1  christos 
   1905  1.1  christos 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   1906  1.1  christos 
   1907  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final block
   1908  1.1  christos 
   1909  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   1910  1.1  christos 
   1911  1.1  christos 	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite (x4/x5 are not used as end pointer / byte count after this point)
   1912  1.1  christos 
   1913  1.1  christos 	and	x7, x7, x10                                 //keep only the valid bits of the decrypted high half
   1914  1.1  christos 
   1915  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   1916  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   1917  1.1  christos 
   1918  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   1919  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   1920  1.1  christos 
   1921  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   1922  1.1  christos 
   1923  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   1924  1.1  christos 	bic	x4, x4, x9           //mask out low existing bytes
   1925  1.1  christos 	and	x6, x6, x9                                  //keep only the valid bits of the decrypted low half
   1926  1.1  christos 
   1927  1.1  christos #ifndef __AARCH64EB__
   1928  1.1  christos 	rev	w9, w12                                     //x9 is free again (mask already applied); w9 = byte-reversed counter for the store below
   1929  1.1  christos #else
   1930  1.1  christos 	mov	w9, w12
   1931  1.1  christos #endif
   1932  1.1  christos 
   1933  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   1934  1.1  christos 	movi	v8.8b, #0xc2
   1935  1.1  christos 
   1936  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   1937  1.1  christos 
   1938  1.1  christos 	bic	x5, x5, x10   //mask out high existing bytes
   1939  1.1  christos 	shl	d8, d8, #56               //mod_constant (d8 = 0xc200000000000000)
   1940  1.1  christos 
   1941  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   1942  1.1  christos 
   1943  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   1944  1.1  christos 
   1945  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   1946  1.1  christos 
   1947  1.1  christos 	orr	x6, x6, x4                                  //merge decrypted bits with the preserved existing output bits (low half)
   1948  1.1  christos 	str	w9, [x16, #12]                          //store the updated counter
   1949  1.1  christos 
   1950  1.1  christos 	orr	x7, x7, x5                                  //merge decrypted bits with the preserved existing output bits (high half)
   1951  1.1  christos 	stp	x6, x7, [x2]                                //store the last block (no post-increment of x2)
   1952  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   1953  1.1  christos 
   1954  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   1955  1.1  christos 
   1956  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   1957  1.1  christos 
   1958  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   1959  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   1960  1.1  christos 
   1961  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   1962  1.1  christos 
   1963  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   1964  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //swap halves and byte-reverse: the same ext/rev64 pair the enc kernels apply when loading from [x3]
   1965  1.1  christos 	rev64	v11.16b, v11.16b
   1966  1.1  christos 	mov	x0, x15                                     //return value; x15 is set before this chunk (the enc kernel prologue puts byte_len there - presumably the same here, TODO confirm)
   1967  1.1  christos 	st1	{ v11.16b }, [x3]                           //write the updated GHASH value back to [x3]
   1968  1.1  christos 	//epilogue: reload x19-x24 and d8-d15 from the 112-byte frame and release it (same offsets as the enc_192 prologue below)
   1969  1.1  christos 	ldp	x21, x22, [sp, #16]
   1970  1.1  christos 	ldp	x23, x24, [sp, #32]
   1971  1.1  christos 	ldp	d8, d9, [sp, #48]
   1972  1.1  christos 	ldp	d10, d11, [sp, #64]
   1973  1.1  christos 	ldp	d12, d13, [sp, #80]
   1974  1.1  christos 	ldp	d14, d15, [sp, #96]
   1975  1.1  christos 	ldp	x19, x20, [sp], #112
   1976  1.1  christos 	ret
   1977  1.1  christos 
   1978  1.1  christos .L128_dec_ret:	//early exit: return 0 without restoring a frame. NOTE(review): the branch here is outside this chunk; the enc kernels reach their equivalent label via cbz x1 before pushing the frame - presumably the same here
   1979  1.1  christos 	mov	w0, #0x0                                    //return value 0
   1980  1.1  christos 	ret
   1981  1.1  christos .size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
   1982  1.1  christos .globl	aes_gcm_enc_192_kernel
   1983  1.1  christos .type	aes_gcm_enc_192_kernel,%function
   1984  1.1  christos .align	4
   1985  1.1  christos aes_gcm_enc_192_kernel:
   1986  1.2  christos 	AARCH64_VALID_CALL_TARGET
   1987  1.1  christos 	cbz	x1, .L192_enc_ret
   1988  1.1  christos 	stp	x19, x20, [sp, #-112]!
   1989  1.1  christos 	mov	x16, x4
   1990  1.1  christos 	mov	x8, x5
   1991  1.1  christos 	stp	x21, x22, [sp, #16]
   1992  1.1  christos 	stp	x23, x24, [sp, #32]
   1993  1.1  christos 	stp	d8, d9, [sp, #48]
   1994  1.1  christos 	stp	d10, d11, [sp, #64]
   1995  1.1  christos 	stp	d12, d13, [sp, #80]
   1996  1.1  christos 	stp	d14, d15, [sp, #96]
   1997  1.1  christos 
   1998  1.1  christos 	ldp	x10, x11, [x16]             //ctr96_b64, ctr96_t32
   1999  1.1  christos #ifdef __AARCH64EB__
   2000  1.1  christos 	rev	x10, x10
   2001  1.1  christos 	rev	x11, x11
   2002  1.1  christos #endif
   2003  1.1  christos 	ldp	x13, x14, [x8, #192]                     //load rk12
   2004  1.1  christos #ifdef __AARCH64EB__
   2005  1.1  christos 	ror	x13, x13, #32
   2006  1.1  christos 	ror	x14, x14, #32
   2007  1.1  christos #endif
   2008  1.1  christos 	ld1	{v18.4s}, [x8], #16	                             //load rk0
   2009  1.1  christos 
   2010  1.1  christos 	ld1	{v19.4s}, [x8], #16	                             //load rk1
   2011  1.1  christos 
   2012  1.1  christos 	ld1	{v20.4s}, [x8], #16	                             //load rk2
   2013  1.1  christos 
   2014  1.1  christos 	lsr	x12, x11, #32
   2015  1.1  christos 	ld1	{v21.4s}, [x8], #16	                             //load rk3
   2016  1.1  christos 	orr	w11, w11, w11
   2017  1.1  christos 
   2018  1.1  christos 	ld1	{v22.4s}, [x8], #16	                             //load rk4
   2019  1.1  christos 	rev	w12, w12                               //rev_ctr32
   2020  1.1  christos 
   2021  1.1  christos 	add	w12, w12, #1                           //increment rev_ctr32
   2022  1.1  christos 	fmov	d3, x10                              //CTR block 3
   2023  1.1  christos 
   2024  1.1  christos 	rev	w9, w12                                //CTR block 1
   2025  1.1  christos 	add	w12, w12, #1                           //CTR block 1
   2026  1.1  christos 	fmov	d1, x10                              //CTR block 1
   2027  1.1  christos 
   2028  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 1
   2029  1.1  christos 	ld1	{ v0.16b}, [x16]                            //special case vector load initial counter so we can start first AES block as quickly as possible
   2030  1.1  christos 
   2031  1.1  christos 	fmov	v1.d[1], x9                              //CTR block 1
   2032  1.1  christos 	rev	w9, w12                                //CTR block 2
   2033  1.1  christos 	add	w12, w12, #1                           //CTR block 2
   2034  1.1  christos 
   2035  1.1  christos 	fmov	d2, x10                              //CTR block 2
   2036  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 2
   2037  1.1  christos 
   2038  1.1  christos 	fmov	v2.d[1], x9                              //CTR block 2
   2039  1.1  christos 	rev	w9, w12                                //CTR block 3
   2040  1.1  christos 
   2041  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 3
   2042  1.1  christos 	ld1	{v23.4s}, [x8], #16	                             //load rk5
   2043  1.1  christos 
   2044  1.1  christos 	fmov	v3.d[1], x9                              //CTR block 3
   2045  1.1  christos 
   2046  1.1  christos 	ld1	{v24.4s}, [x8], #16	                             //load rk6
   2047  1.1  christos 
   2048  1.1  christos 	ld1	{v25.4s}, [x8], #16	                             //load rk7
   2049  1.1  christos 
   2050  1.1  christos 	aese	v0.16b, v18.16b
   2051  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 0
   2052  1.1  christos 	ld1	{ v11.16b}, [x3]
   2053  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   2054  1.1  christos 	rev64	v11.16b, v11.16b
   2055  1.1  christos 
   2056  1.1  christos 	aese	v3.16b, v18.16b
   2057  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 0
   2058  1.1  christos 	ld1	{v26.4s}, [x8], #16	                             //load rk8
   2059  1.1  christos 
   2060  1.1  christos 	aese	v1.16b, v18.16b
   2061  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 0
   2062  1.1  christos 	ldr	q15, [x3, #112]                       //load h4l | h4h
   2063  1.1  christos #ifndef __AARCH64EB__
   2064  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
   2065  1.1  christos #endif
   2066  1.1  christos 	aese	v2.16b, v18.16b
   2067  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 0
   2068  1.1  christos 	ld1	{v27.4s}, [x8], #16	                             //load rk9
   2069  1.1  christos 
   2070  1.1  christos 	aese	v0.16b, v19.16b
   2071  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 1
   2072  1.1  christos 	ld1	{v28.4s}, [x8], #16	                         //load rk10
   2073  1.1  christos 
   2074  1.1  christos 	aese	v1.16b, v19.16b
   2075  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 1
   2076  1.1  christos 	ldr	q12, [x3, #32]                        //load h1l | h1h
   2077  1.1  christos #ifndef __AARCH64EB__
   2078  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
   2079  1.1  christos #endif
   2080  1.1  christos 	aese	v2.16b, v19.16b
   2081  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 1
   2082  1.1  christos 	ld1	{v29.4s}, [x8], #16	                         //load rk11
   2083  1.1  christos 
   2084  1.1  christos 	aese	v3.16b, v19.16b
   2085  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 1
   2086  1.1  christos 	ldr	q14, [x3, #80]                        //load h3l | h3h
   2087  1.1  christos #ifndef __AARCH64EB__
   2088  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
   2089  1.1  christos #endif
   2090  1.1  christos 	aese	v0.16b, v20.16b
   2091  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 2
   2092  1.1  christos 
   2093  1.1  christos 	aese	v2.16b, v20.16b
   2094  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 2
   2095  1.1  christos 
   2096  1.1  christos 	aese	v3.16b, v20.16b
   2097  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 2
   2098  1.1  christos 
   2099  1.1  christos 	aese	v0.16b, v21.16b
   2100  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 3
   2101  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                     //h4h | h3h
   2102  1.1  christos 
   2103  1.1  christos 	aese	v2.16b, v21.16b
   2104  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 3
   2105  1.1  christos 
   2106  1.1  christos 	aese	v1.16b, v20.16b
   2107  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 2
   2108  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                     //h4l | h3l
   2109  1.1  christos 
   2110  1.1  christos 	aese	v0.16b, v22.16b
   2111  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 4
   2112  1.1  christos 
   2113  1.1  christos 	aese	v3.16b, v21.16b
   2114  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 3
   2115  1.1  christos 
   2116  1.1  christos 	aese	v1.16b, v21.16b
   2117  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 3
   2118  1.1  christos 
   2119  1.1  christos 	aese	v0.16b, v23.16b
   2120  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 5
   2121  1.1  christos 
   2122  1.1  christos 	aese	v2.16b, v22.16b
   2123  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 4
   2124  1.1  christos 
   2125  1.1  christos 	aese	v1.16b, v22.16b
   2126  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 4
   2127  1.1  christos 
   2128  1.1  christos 	aese	v0.16b, v24.16b
   2129  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 6
   2130  1.1  christos 
   2131  1.1  christos 	aese	v3.16b, v22.16b
   2132  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 4
   2133  1.1  christos 
   2134  1.1  christos 	aese	v2.16b, v23.16b
   2135  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 5
   2136  1.1  christos 
   2137  1.1  christos 	aese	v1.16b, v23.16b
   2138  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 5
   2139  1.1  christos 
   2140  1.1  christos 	aese	v3.16b, v23.16b
   2141  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 5
   2142  1.1  christos 
   2143  1.1  christos 	aese	v2.16b, v24.16b
   2144  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 6
   2145  1.1  christos 	ldr	q13, [x3, #64]                        //load h2l | h2h
   2146  1.1  christos #ifndef __AARCH64EB__
   2147  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
   2148  1.1  christos #endif
   2149  1.1  christos 	aese	v1.16b, v24.16b
   2150  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 6
   2151  1.1  christos 
   2152  1.1  christos 	aese	v3.16b, v24.16b
   2153  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 6
   2154  1.1  christos 
   2155  1.1  christos 	aese	v0.16b, v25.16b
   2156  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 7
   2157  1.1  christos 
   2158  1.1  christos 	aese	v1.16b, v25.16b
   2159  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 7
   2160  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                     //h2l | h1l
   2161  1.1  christos 
   2162  1.1  christos 	aese	v3.16b, v25.16b
   2163  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 7
   2164  1.1  christos 
   2165  1.1  christos 	aese	v0.16b, v26.16b
   2166  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 8
   2167  1.1  christos 
   2168  1.1  christos 	aese	v2.16b, v25.16b
   2169  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 7
   2170  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                     //h2h | h1h
   2171  1.1  christos 
   2172  1.1  christos 	aese	v1.16b, v26.16b
   2173  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 8
   2174  1.1  christos 
   2175  1.1  christos 	aese	v3.16b, v26.16b
   2176  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 8
   2177  1.1  christos 
   2178  1.1  christos 	aese	v2.16b, v26.16b
   2179  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 8
   2180  1.1  christos 
   2181  1.1  christos 	aese	v0.16b, v27.16b
   2182  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 9
   2183  1.1  christos 
   2184  1.1  christos 	aese	v3.16b, v27.16b
   2185  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 9
   2186  1.1  christos 
   2187  1.1  christos 	aese	v2.16b, v27.16b
   2188  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 9
   2189  1.1  christos 
   2190  1.1  christos 	aese	v1.16b, v27.16b
   2191  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 9
   2192  1.1  christos 
   2193  1.1  christos 	aese	v0.16b, v28.16b
   2194  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 0 - round 10
   2195  1.1  christos 
   2196  1.1  christos 	aese	v2.16b, v28.16b
   2197  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 2 - round 10
   2198  1.1  christos 
   2199  1.1  christos 	aese	v1.16b, v28.16b
   2200  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 1 - round 10
   2201  1.1  christos 	lsr	x5, x1, #3             //byte_len
   2202  1.1  christos 	mov	x15, x5
   2203  1.1  christos 
   2204  1.1  christos 	aese	v3.16b, v28.16b
   2205  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 3 - round 10
   2206  1.1  christos 	sub	x5, x5, #1     //byte_len - 1
   2207  1.1  christos 
   2208  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                    //h2k | h1k
   2209  1.1  christos 	and	x5, x5, #0xffffffffffffffc0   //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   2210  1.1  christos 
   2211  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                 //h4k | h3k
   2212  1.1  christos 
   2213  1.1  christos 	aese	v2.16b, v29.16b                                    //AES block 2 - round 11
   2214  1.1  christos 	add	x4, x0, x1, lsr #3  //end_input_ptr
   2215  1.1  christos 	add	x5, x5, x0
   2216  1.1  christos 
   2217  1.1  christos 	aese	v1.16b, v29.16b                                    //AES block 1 - round 11
   2218  1.1  christos 	cmp	x0, x5                  //check if we have <= 4 blocks
   2219  1.1  christos 
   2220  1.1  christos 	aese	v0.16b, v29.16b                                    //AES block 0 - round 11
   2221  1.1  christos 	add	w12, w12, #1                           //CTR block 3
   2222  1.1  christos 
   2223  1.1  christos 	aese	v3.16b, v29.16b                                    //AES block 3 - round 11
   2224  1.1  christos 	b.ge	.L192_enc_tail                                   //handle tail
   2225  1.1  christos 
   2226  1.1  christos 	rev	w9, w12                                //CTR block 4
   2227  1.1  christos 	ldp	x6, x7, [x0, #0]           //AES block 0 - load plaintext
   2228  1.1  christos #ifdef __AARCH64EB__
   2229  1.1  christos 	rev	x6, x6
   2230  1.1  christos 	rev	x7, x7
   2231  1.1  christos #endif
   2232  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 4
   2233  1.1  christos 	ldp	x21, x22, [x0, #32]          //AES block 2 - load plaintext
   2234  1.1  christos #ifdef __AARCH64EB__
   2235  1.1  christos 	rev	x21, x21
   2236  1.1  christos 	rev	x22, x22
   2237  1.1  christos #endif
   2238  1.1  christos 	ldp	x23, x24, [x0, #48]          //AES block 3 - load plaintext
   2239  1.1  christos #ifdef __AARCH64EB__
   2240  1.1  christos 	rev	x23, x23
   2241  1.1  christos 	rev	x24, x24
   2242  1.1  christos #endif
   2243  1.1  christos 	ldp	x19, x20, [x0, #16]          //AES block 1 - load plaintext
   2244  1.1  christos #ifdef __AARCH64EB__
   2245  1.1  christos 	rev	x19, x19
   2246  1.1  christos 	rev	x20, x20
   2247  1.1  christos #endif
   2248  1.1  christos 	add	x0, x0, #64                      //AES input_ptr update
   2249  1.1  christos 	cmp	x0, x5                  //check if we have <= 8 blocks
   2250  1.1  christos 
   2251  1.1  christos 	eor	x6, x6, x13                    //AES block 0 - round 12 low
   2252  1.1  christos 
   2253  1.1  christos 	eor	x7, x7, x14                    //AES block 0 - round 12 high
   2254  1.1  christos 	eor	x22, x22, x14                    //AES block 2 - round 12 high
   2255  1.1  christos 	fmov	d4, x6                              //AES block 0 - mov low
   2256  1.1  christos 
   2257  1.1  christos 	eor	x24, x24, x14                    //AES block 3 - round 12 high
   2258  1.1  christos 	fmov	v4.d[1], x7                          //AES block 0 - mov high
   2259  1.1  christos 
   2260  1.1  christos 	eor	x21, x21, x13                    //AES block 2 - round 12 low
   2261  1.1  christos 	eor	x19, x19, x13                    //AES block 1 - round 12 low
   2262  1.1  christos 
   2263  1.1  christos 	fmov	d5, x19                              //AES block 1 - mov low
   2264  1.1  christos 	eor	x20, x20, x14                    //AES block 1 - round 12 high
   2265  1.1  christos 
   2266  1.1  christos 	fmov	v5.d[1], x20                          //AES block 1 - mov high
   2267  1.1  christos 
   2268  1.1  christos 	eor	x23, x23, x13                    //AES block 3 - round 12 low
   2269  1.1  christos 	fmov	d6, x21                              //AES block 2 - mov low
   2270  1.1  christos 
   2271  1.1  christos 	add	w12, w12, #1                           //CTR block 4
   2272  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                         //AES block 0 - result
   2273  1.1  christos 	fmov	d0, x10                              //CTR block 4
   2274  1.1  christos 
   2275  1.1  christos 	fmov	v0.d[1], x9                              //CTR block 4
   2276  1.1  christos 	rev	w9, w12                                //CTR block 5
   2277  1.1  christos 
   2278  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 5
   2279  1.1  christos 	add	w12, w12, #1                           //CTR block 5
   2280  1.1  christos 
   2281  1.1  christos 	fmov	d7, x23                              //AES block 3 - mov low
   2282  1.1  christos 	st1	{ v4.16b}, [x2], #16                    //AES block 0 - store result
   2283  1.1  christos 
   2284  1.1  christos 	fmov	v6.d[1], x22                          //AES block 2 - mov high
   2285  1.1  christos 
   2286  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                         //AES block 1 - result
   2287  1.1  christos 	fmov	d1, x10                              //CTR block 5
   2288  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES block 1 - store result
   2289  1.1  christos 
   2290  1.1  christos 	fmov	v7.d[1], x24                          //AES block 3 - mov high
   2291  1.1  christos 
   2292  1.1  christos 	fmov	v1.d[1], x9                              //CTR block 5
   2293  1.1  christos 	rev	w9, w12                                //CTR block 6
   2294  1.1  christos 
   2295  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 6
   2296  1.1  christos 
   2297  1.1  christos 	add	w12, w12, #1                           //CTR block 6
   2298  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                         //AES block 2 - result
   2299  1.1  christos 	fmov	d2, x10                              //CTR block 6
   2300  1.1  christos 
   2301  1.1  christos 	fmov	v2.d[1], x9                              //CTR block 6
   2302  1.1  christos 	rev	w9, w12                                //CTR block 7
   2303  1.1  christos 
   2304  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 7
   2305  1.1  christos 	st1	{ v6.16b}, [x2], #16                    //AES block 2 - store result
   2306  1.1  christos 
   2307  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                         //AES block 3 - result
   2308  1.1  christos 	st1	{ v7.16b}, [x2], #16                    //AES block 3 - store result
   2309  1.1  christos 	b.ge	.L192_enc_prepretail                             //do prepretail
   2310  1.1  christos 
   2311  1.1  christos .L192_enc_main_loop:	//main loop start - each pass reads 64 bytes at x0, writes 64 bytes at x2 and folds v4-v7 (the four result blocks produced just before) into the GHASH state v11
   2312  1.1  christos 	aese	v2.16b, v18.16b
   2313  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
   2314  1.1  christos 	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
   2315  1.1  christos //x0 = input pointer, x2 = output pointer, x5 = loop bound tested at .LOOP CONTROL
   2316  1.1  christos 	aese	v1.16b, v18.16b
   2317  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
   2318  1.1  christos 	ldp	x19, x20, [x0, #16]          //AES block 4k+5 - load plaintext
   2319  1.1  christos #ifdef __AARCH64EB__
   2320  1.1  christos 	rev	x19, x19
   2321  1.1  christos 	rev	x20, x20
   2322  1.1  christos #endif
   2323  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
   2324  1.1  christos 	fmov	d3, x10                              //CTR block 4k+7
   2325  1.1  christos 	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
   2326  1.1  christos //counter block = x10 (low 64 bits) : x9 (high 64 bits), with x9 = x11 | (byte-reversed w12 << 32)
   2327  1.1  christos 	aese	v2.16b, v19.16b
   2328  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
   2329  1.1  christos 	fmov	v3.d[1], x9                              //CTR block 4k+7
   2330  1.1  christos //v18-v29 are the round-key operands for rounds 0-11; the last key is applied as x13/x14 XORed into the plaintext ("round 12")
   2331  1.1  christos 	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
   2332  1.1  christos 	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2333  1.1  christos 	ldp	x21, x22, [x0, #32]          //AES block 4k+6 - load plaintext
   2334  1.1  christos #ifdef __AARCH64EB__
   2335  1.1  christos 	rev	x21, x21
   2336  1.1  christos 	rev	x22, x22
   2337  1.1  christos #endif
   2338  1.1  christos 	aese	v0.16b, v18.16b
   2339  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
   2340  1.1  christos 	ldp	x23, x24, [x0, #48]          //AES block 4k+7 - load plaintext
   2341  1.1  christos #ifdef __AARCH64EB__
   2342  1.1  christos 	rev	x23, x23
   2343  1.1  christos 	rev	x24, x24
   2344  1.1  christos #endif
   2345  1.1  christos 	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
   2346  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
   2347  1.1  christos 
   2348  1.1  christos 	aese	v1.16b, v19.16b
   2349  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
   2350  1.1  christos 
   2351  1.1  christos 	aese	v0.16b, v19.16b
   2352  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
   2353  1.1  christos 	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
   2354  1.1  christos 
   2355  1.1  christos 	aese	v3.16b, v18.16b
   2356  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
   2357  1.1  christos 	eor	x24, x24, x14                    //AES block 4k+7 - round 12 high
   2358  1.1  christos //v15/v14/v13/v12 multiply GHASH blocks 4k/4k+1/4k+2/4k+3; v17 and v16 supply the multipliers for the mid (karatsuba) terms
   2359  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
   2360  1.1  christos 	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
   2361  1.1  christos 
   2362  1.1  christos 	aese	v0.16b, v20.16b
   2363  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
   2364  1.1  christos 
   2365  1.1  christos 	aese	v3.16b, v19.16b
   2366  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
   2367  1.1  christos 	eor	x21, x21, x13                    //AES block 4k+6 - round 12 low
   2368  1.1  christos 
   2369  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
   2370  1.1  christos 	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
   2371  1.1  christos 
   2372  1.1  christos 	aese	v0.16b, v21.16b
   2373  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
   2374  1.1  christos 	eor	x19, x19, x13                    //AES block 4k+5 - round 12 low
   2375  1.1  christos 
   2376  1.1  christos 	aese	v1.16b, v20.16b
   2377  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
   2378  1.1  christos 	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
   2379  1.1  christos 
   2380  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
   2381  1.1  christos 	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
   2382  1.1  christos 
   2383  1.1  christos 	aese	v2.16b, v20.16b
   2384  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
   2385  1.1  christos 
   2386  1.1  christos 	aese	v1.16b, v21.16b
   2387  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
   2388  1.1  christos 
   2389  1.1  christos 	mov	d10, v17.d[1]                              //GHASH block 4k - mid
   2390  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
   2391  1.1  christos 
   2392  1.1  christos 	aese	v3.16b, v20.16b
   2393  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
   2394  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
   2395  1.1  christos 
   2396  1.1  christos 	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
   2397  1.1  christos 
   2398  1.1  christos 	aese	v0.16b, v22.16b
   2399  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
   2400  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
   2401  1.1  christos 
   2402  1.1  christos 	aese	v3.16b, v21.16b
   2403  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
   2404  1.1  christos 
   2405  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
   2406  1.1  christos 	eor	x20, x20, x14                    //AES block 4k+5 - round 12 high
   2407  1.1  christos 	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
   2408  1.1  christos 
   2409  1.1  christos 	aese	v0.16b, v23.16b
   2410  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
   2411  1.1  christos 	add	w12, w12, #1                           //CTR block 4k+7
   2412  1.1  christos 
   2413  1.1  christos 	aese	v3.16b, v22.16b
   2414  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
   2415  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
   2416  1.1  christos 
   2417  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
   2418  1.1  christos 	eor	x22, x22, x14                    //AES block 4k+6 - round 12 high
   2419  1.1  christos 
   2420  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
   2421  1.1  christos 	eor	x23, x23, x13                    //AES block 4k+7 - round 12 low
   2422  1.1  christos 	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
   2423  1.1  christos 
   2424  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
   2425  1.1  christos 	rev	w9, w12                                //CTR block 4k+8
   2426  1.1  christos 
   2427  1.1  christos 	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
   2428  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 4k+8
   2429  1.1  christos 
   2430  1.1  christos 	aese	v2.16b, v21.16b
   2431  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
   2432  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
   2433  1.1  christos 
   2434  1.1  christos 	aese	v1.16b, v22.16b
   2435  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
   2436  1.1  christos 	ldp	x6, x7, [x0, #0]           //AES block 4k+4 - load plaintext
   2437  1.1  christos #ifdef __AARCH64EB__
   2438  1.1  christos 	rev	x6, x6
   2439  1.1  christos 	rev	x7, x7
   2440  1.1  christos #endif
   2441  1.1  christos 	aese	v0.16b, v24.16b
   2442  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
   2443  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
   2444  1.1  christos 
   2445  1.1  christos 	aese	v2.16b, v22.16b
   2446  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
   2447  1.1  christos 	add	x0, x0, #64                      //AES input_ptr update
   2448  1.1  christos 
   2449  1.1  christos 	aese	v1.16b, v23.16b
   2450  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
   2451  1.1  christos 	movi	v8.8b, #0xc2                //mod_constant - 0xc2 in each of the low 8 byte lanes; the shl #56 below leaves 0xc2 in the top byte of d8
   2452  1.1  christos 
   2453  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
   2454  1.1  christos 	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
   2455  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
   2456  1.1  christos 
   2457  1.1  christos 	aese	v2.16b, v23.16b
   2458  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
   2459  1.1  christos 	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
   2460  1.1  christos 
   2461  1.1  christos 	aese	v1.16b, v24.16b
   2462  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
   2463  1.1  christos 	shl	d8, d8, #56              //mod_constant
   2464  1.1  christos 
   2465  1.1  christos 	aese	v3.16b, v23.16b
   2466  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
   2467  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
   2468  1.1  christos 
   2469  1.1  christos 	aese	v0.16b, v25.16b
   2470  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
   2471  1.1  christos 	fmov	d5, x19                              //AES block 4k+5 - mov low
   2472  1.1  christos 
   2473  1.1  christos 	aese	v1.16b, v25.16b
   2474  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
   2475  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
   2476  1.1  christos 
   2477  1.1  christos 	aese	v3.16b, v24.16b
   2478  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
   2479  1.1  christos 	fmov	v5.d[1], x20                          //AES block 4k+5 - mov high
   2480  1.1  christos 
   2481  1.1  christos 	aese	v0.16b, v26.16b
   2482  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
   2483  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
   2484  1.1  christos 
   2485  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
   2486  1.1  christos 	cmp	x0, x5                  //.LOOP CONTROL - flags are consumed by the b.lt at the end of the loop; nothing in between sets flags
   2487  1.1  christos 	fmov	d4, x6                              //AES block 4k+4 - mov low
   2488  1.1  christos 
   2489  1.1  christos 	aese	v2.16b, v24.16b
   2490  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
   2491  1.1  christos 	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
   2492  1.1  christos 
   2493  1.1  christos 	aese	v1.16b, v26.16b
   2494  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
   2495  1.1  christos 	fmov	d7, x23                              //AES block 4k+7 - mov low
   2496  1.1  christos 
   2497  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
   2498  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   2499  1.1  christos 	add	w12, w12, #1                           //CTR block 4k+8
   2500  1.1  christos 
   2501  1.1  christos 	aese	v2.16b, v25.16b
   2502  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
   2503  1.1  christos 	fmov	v7.d[1], x24                          //AES block 4k+7 - mov high
   2504  1.1  christos //MODULO - v9/v10/v11 now hold the accumulated high/mid/low products of the four GHASH blocks; v8 is the reduction constant
   2505  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   2506  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   2507  1.1  christos 	fmov	d6, x21                              //AES block 4k+6 - mov low
   2508  1.1  christos 
   2509  1.1  christos 	aese	v3.16b, v25.16b
   2510  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
   2511  1.1  christos 
   2512  1.1  christos 	aese	v0.16b, v27.16b
   2513  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
   2514  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
   2515  1.1  christos 
   2516  1.1  christos 	aese	v2.16b, v26.16b
   2517  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
   2518  1.1  christos 
   2519  1.1  christos 	aese	v3.16b, v26.16b
   2520  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
   2521  1.1  christos 
   2522  1.1  christos 	aese	v1.16b, v27.16b
   2523  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
   2524  1.1  christos 
   2525  1.1  christos 	aese	v0.16b, v28.16b
   2526  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
   2527  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
   2528  1.1  christos 
   2529  1.1  christos 	aese	v3.16b, v27.16b
   2530  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
   2531  1.1  christos 
   2532  1.1  christos 	aese	v2.16b, v27.16b
   2533  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
   2534  1.1  christos 
   2535  1.1  christos 	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
   2536  1.1  christos 
   2537  1.1  christos 	aese	v1.16b, v28.16b
   2538  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
   2539  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   2540  1.1  christos 
   2541  1.1  christos 	aese	v2.16b, v28.16b
   2542  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
   2543  1.1  christos //output block = (plaintext ^ x13:x14) ^ AES state after round 11; the state register is then reloaded with the next counter block
   2544  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
   2545  1.1  christos 	fmov	d0, x10                              //CTR block 4k+8
   2546  1.1  christos 
   2547  1.1  christos 	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
   2548  1.1  christos 	fmov	v0.d[1], x9                              //CTR block 4k+8
   2549  1.1  christos 	rev	w9, w12                                //CTR block 4k+9
   2550  1.1  christos 
   2551  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   2552  1.1  christos 	fmov	v6.d[1], x22                          //AES block 4k+6 - mov high
   2553  1.1  christos 	st1	{ v4.16b}, [x2], #16                    //AES block 4k+4 - store result
   2554  1.1  christos 
   2555  1.1  christos 	aese	v3.16b, v28.16b
   2556  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
   2557  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 4k+9
   2558  1.1  christos 
   2559  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                         //AES block 4k+5 - result
   2560  1.1  christos 	add	w12, w12, #1                           //CTR block 4k+9
   2561  1.1  christos 	fmov	d1, x10                              //CTR block 4k+9
   2562  1.1  christos 
   2563  1.1  christos 	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
   2564  1.1  christos 	fmov	v1.d[1], x9                              //CTR block 4k+9
   2565  1.1  christos 	rev	w9, w12                                //CTR block 4k+10
   2566  1.1  christos 
   2567  1.1  christos 	add	w12, w12, #1                           //CTR block 4k+10
   2568  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   2569  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 4k+10
   2570  1.1  christos 
   2571  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES block 4k+5 - store result
   2572  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   2573  1.1  christos 
   2574  1.1  christos 	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
   2575  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                         //AES block 4k+6 - result
   2576  1.1  christos 	fmov	d2, x10                              //CTR block 4k+10
   2577  1.1  christos 
   2578  1.1  christos 	st1	{ v6.16b}, [x2], #16                    //AES block 4k+6 - store result
   2579  1.1  christos 	fmov	v2.d[1], x9                              //CTR block 4k+10
   2580  1.1  christos 	rev	w9, w12                                //CTR block 4k+11
   2581  1.1  christos 
   2582  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
   2583  1.1  christos 	orr	x9, x11, x9, lsl #32           //CTR block 4k+11
   2584  1.1  christos 
   2585  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                         //AES block 4k+7 - result
   2586  1.1  christos 	st1	{ v7.16b}, [x2], #16                    //AES block 4k+7 - store result
   2587  1.1  christos 	b.lt	.L192_enc_main_loop                              //repeat while x0 < x5 (signed compare made at .LOOP CONTROL)
   2588  1.1  christos 
   2589  1.1  christos .L192_enc_prepretail:	//PREPRETAIL
   2590  1.1  christos 	aese	v0.16b, v18.16b
   2591  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
   2592  1.1  christos 	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
   2593  1.1  christos 
   2594  1.1  christos 	fmov	d3, x10                              //CTR block 4k+3
   2595  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
   2596  1.1  christos 	add	w12, w12, #1                           //CTR block 4k+3
   2597  1.1  christos 
   2598  1.1  christos 	aese	v1.16b, v18.16b
   2599  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
   2600  1.1  christos 	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
   2601  1.1  christos 
   2602  1.1  christos 	aese	v2.16b, v18.16b
   2603  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
   2604  1.1  christos 
   2605  1.1  christos 	fmov	v3.d[1], x9                              //CTR block 4k+3
   2606  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
   2607  1.1  christos 	mov	d10, v17.d[1]                              //GHASH block 4k - mid
   2608  1.1  christos 
   2609  1.1  christos 	aese	v1.16b, v19.16b
   2610  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
   2611  1.1  christos 	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
   2612  1.1  christos 
   2613  1.1  christos 	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
   2614  1.1  christos 
   2615  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
   2616  1.1  christos 	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
   2617  1.1  christos 
   2618  1.1  christos 	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
   2619  1.1  christos 	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   2620  1.1  christos 
   2621  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
   2622  1.1  christos 
   2623  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
   2624  1.1  christos 	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
   2625  1.1  christos 
   2626  1.1  christos 	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
   2627  1.1  christos 	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
   2628  1.1  christos 
   2629  1.1  christos 	aese	v3.16b, v18.16b
   2630  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
   2631  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
   2632  1.1  christos 
   2633  1.1  christos 	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
   2634  1.1  christos 
   2635  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
   2636  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
   2637  1.1  christos 
   2638  1.1  christos 	aese	v3.16b, v19.16b
   2639  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
   2640  1.1  christos 
   2641  1.1  christos 	aese	v2.16b, v19.16b
   2642  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
   2643  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
   2644  1.1  christos 
   2645  1.1  christos 	aese	v0.16b, v19.16b
   2646  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
   2647  1.1  christos 
   2648  1.1  christos 	aese	v1.16b, v20.16b
   2649  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
   2650  1.1  christos 	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
   2651  1.1  christos 
   2652  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
   2653  1.1  christos 	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
   2654  1.1  christos 
   2655  1.1  christos 	aese	v0.16b, v20.16b
   2656  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
   2657  1.1  christos 
   2658  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
   2659  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
   2660  1.1  christos 
   2661  1.1  christos 	aese	v1.16b, v21.16b
   2662  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
   2663  1.1  christos 
   2664  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
   2665  1.1  christos 
   2666  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
   2667  1.1  christos 
   2668  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
   2669  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
   2670  1.1  christos 
   2671  1.1  christos 	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
   2672  1.1  christos 
   2673  1.1  christos 	aese	v0.16b, v21.16b
   2674  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
   2675  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
   2676  1.1  christos 
   2677  1.1  christos 	aese	v3.16b, v20.16b
   2678  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
   2679  1.1  christos 
   2680  1.1  christos 	aese	v2.16b, v20.16b
   2681  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
   2682  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
   2683  1.1  christos 
   2684  1.1  christos 	aese	v0.16b, v22.16b
   2685  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
   2686  1.1  christos 
   2687  1.1  christos 	aese	v3.16b, v21.16b
   2688  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
   2689  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
   2690  1.1  christos 
   2691  1.1  christos 	aese	v2.16b, v21.16b
   2692  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
   2693  1.1  christos 
   2694  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
   2695  1.1  christos 	movi	v8.8b, #0xc2
   2696  1.1  christos 
   2697  1.1  christos 	aese	v3.16b, v22.16b
   2698  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
   2699  1.1  christos 
   2700  1.1  christos 	aese	v2.16b, v22.16b
   2701  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
   2702  1.1  christos 
   2703  1.1  christos 	aese	v1.16b, v22.16b
   2704  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
   2705  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
   2706  1.1  christos 
   2707  1.1  christos 	aese	v3.16b, v23.16b
   2708  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
   2709  1.1  christos 
   2710  1.1  christos 	aese	v2.16b, v23.16b
   2711  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
   2712  1.1  christos 
   2713  1.1  christos 	aese	v1.16b, v23.16b
   2714  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
   2715  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
   2716  1.1  christos 
   2717  1.1  christos 	aese	v0.16b, v23.16b
   2718  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
   2719  1.1  christos 
   2720  1.1  christos 	aese	v3.16b, v24.16b
   2721  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
   2722  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                        //karatsuba tidy up
   2723  1.1  christos 
   2724  1.1  christos 	aese	v1.16b, v24.16b
   2725  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
   2726  1.1  christos 
   2727  1.1  christos 	aese	v0.16b, v24.16b
   2728  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
   2729  1.1  christos 	shl	d8, d8, #56              //mod_constant
   2730  1.1  christos 
   2731  1.1  christos 	aese	v3.16b, v25.16b
   2732  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
   2733  1.1  christos 
   2734  1.1  christos 	aese	v1.16b, v25.16b
   2735  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
   2736  1.1  christos 	eor	v10.16b, v10.16b, v11.16b
   2737  1.1  christos 
   2738  1.1  christos 	aese	v0.16b, v25.16b
   2739  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
   2740  1.1  christos 
   2741  1.1  christos 	pmull	v30.1q, v9.1d, v8.1d
   2742  1.1  christos 
   2743  1.1  christos 	aese	v2.16b, v24.16b
   2744  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
   2745  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8
   2746  1.1  christos 
   2747  1.1  christos 	aese	v0.16b, v26.16b
   2748  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
   2749  1.1  christos 
   2750  1.1  christos 	aese	v1.16b, v26.16b
   2751  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
   2752  1.1  christos 	eor	v10.16b, v10.16b, v30.16b
   2753  1.1  christos 
   2754  1.1  christos 	aese	v2.16b, v25.16b
   2755  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
   2756  1.1  christos 
   2757  1.1  christos 	aese	v3.16b, v26.16b
   2758  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
   2759  1.1  christos 
   2760  1.1  christos 	aese	v0.16b, v27.16b
   2761  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
   2762  1.1  christos 
   2763  1.1  christos 	aese	v2.16b, v26.16b
   2764  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
   2765  1.1  christos 	eor	v10.16b, v10.16b, v9.16b
   2766  1.1  christos 
   2767  1.1  christos 	aese	v3.16b, v27.16b
   2768  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
   2769  1.1  christos 
   2770  1.1  christos 	aese	v1.16b, v27.16b
   2771  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
   2772  1.1  christos 
   2773  1.1  christos 	aese	v2.16b, v27.16b
   2774  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
   2775  1.1  christos 
   2776  1.1  christos 	pmull	v30.1q, v10.1d, v8.1d
   2777  1.1  christos 
   2778  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8
   2779  1.1  christos 
   2780  1.1  christos 	aese	v3.16b, v28.16b
   2781  1.1  christos 	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
   2782  1.1  christos 
   2783  1.1  christos 	aese	v0.16b, v28.16b
   2784  1.1  christos 	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
   2785  1.1  christos 
   2786  1.1  christos 	aese	v2.16b, v28.16b
   2787  1.1  christos 	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
   2788  1.1  christos 
   2789  1.1  christos 	aese	v1.16b, v28.16b
   2790  1.1  christos 	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
   2791  1.1  christos 	eor	v11.16b, v11.16b, v30.16b
   2792  1.1  christos 
   2793  1.1  christos 	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
   2794  1.1  christos 
   2795  1.1  christos 	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
   2796  1.1  christos 
   2797  1.1  christos 	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
   2798  1.1  christos 
   2799  1.1  christos 	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
   2800  1.1  christos 	eor	v11.16b, v11.16b, v10.16b
   2801  1.1  christos .L192_enc_tail:	//TAIL
   2802  1.1  christos 
   2803  1.1  christos 	sub	x5, x4, x0  //main_end_input_ptr is number of bytes left to process
   2804  1.1  christos 	ldp	x6, x7, [x0], #16          //AES block 4k+4 - load plaintext
   2805  1.1  christos #ifdef __AARCH64EB__
   2806  1.1  christos 	rev	x6, x6
   2807  1.1  christos 	rev	x7, x7
   2808  1.1  christos #endif
   2809  1.1  christos 	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
   2810  1.1  christos 	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
   2811  1.1  christos 
   2812  1.1  christos 	fmov	d4, x6                              //AES block 4k+4 - mov low
   2813  1.1  christos 
   2814  1.1  christos 	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
   2815  1.1  christos 	cmp	x5, #48
   2816  1.1  christos 
   2817  1.1  christos 	eor	v5.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
   2818  1.1  christos 
   2819  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                    //prepare final partial tag
   2820  1.1  christos 	b.gt	.L192_enc_blocks_more_than_3
   2821  1.1  christos 
   2822  1.1  christos 	sub	w12, w12, #1
   2823  1.1  christos 	movi	v10.8b, #0
   2824  1.1  christos 
   2825  1.1  christos 	mov	v3.16b, v2.16b
   2826  1.1  christos 	movi	v9.8b, #0
   2827  1.1  christos 	cmp	x5, #32
   2828  1.1  christos 
   2829  1.1  christos 	mov	v2.16b, v1.16b
   2830  1.1  christos 	movi	v11.8b, #0
   2831  1.1  christos 	b.gt	.L192_enc_blocks_more_than_2
   2832  1.1  christos 
   2833  1.1  christos 	sub	w12, w12, #1
   2834  1.1  christos 
   2835  1.1  christos 	mov	v3.16b, v1.16b
   2836  1.1  christos 	cmp	x5, #16
   2837  1.1  christos 	b.gt	.L192_enc_blocks_more_than_1
   2838  1.1  christos 
   2839  1.1  christos 	sub	w12, w12, #1
   2840  1.1  christos 	b	.L192_enc_blocks_less_than_1
   2841  1.1  christos .L192_enc_blocks_more_than_3:	//blocks	left >  3
   2842  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
   2843  1.1  christos 
   2844  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
   2845  1.1  christos #ifdef __AARCH64EB__
   2846  1.1  christos 	rev	x6, x6
   2847  1.1  christos 	rev	x7, x7
   2848  1.1  christos #endif
   2849  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   2850  1.1  christos 
   2851  1.1  christos 	eor	x6, x6, x13                    //AES final-2 block - round 12 low
   2852  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2853  1.1  christos 
   2854  1.1  christos 	eor	x7, x7, x14                    //AES final-2 block - round 12 high
   2855  1.1  christos 	fmov	d5, x6                                //AES final-2 block - mov low
   2856  1.1  christos 
   2857  1.1  christos 	fmov	v5.d[1], x7                            //AES final-2 block - mov high
   2858  1.1  christos 
   2859  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
   2860  1.1  christos 
   2861  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
   2862  1.1  christos 
   2863  1.1  christos 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
   2864  1.1  christos 
   2865  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
   2866  1.1  christos 
   2867  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2868  1.1  christos 
   2869  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
   2870  1.1  christos 
   2871  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
   2872  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
   2873  1.1  christos .L192_enc_blocks_more_than_2:	//blocks	left >  2
   2874  1.1  christos 
   2875  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
   2876  1.1  christos 
   2877  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
   2878  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
   2879  1.1  christos #ifdef __AARCH64EB__
   2880  1.1  christos 	rev	x6, x6
   2881  1.1  christos 	rev	x7, x7
   2882  1.1  christos #endif
   2883  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2884  1.1  christos 
   2885  1.1  christos 	eor	x7, x7, x14                    //AES final-1 block - round 12 high
   2886  1.1  christos 
   2887  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
   2888  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
   2889  1.1  christos 
   2890  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
   2891  1.1  christos 	eor	x6, x6, x13                    //AES final-1 block - round 12 low
   2892  1.1  christos 
   2893  1.1  christos 	fmov	d5, x6                                //AES final-1 block - mov low
   2894  1.1  christos 
   2895  1.1  christos 	fmov	v5.d[1], x7                            //AES final-1 block - mov high
   2896  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
   2897  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
   2898  1.1  christos 
   2899  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
   2900  1.1  christos 
   2901  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
   2902  1.1  christos 
   2903  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2904  1.1  christos 
   2905  1.1  christos 	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
   2906  1.1  christos 
   2907  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
   2908  1.1  christos .L192_enc_blocks_more_than_1:	//blocks	left >  1
   2909  1.1  christos 
   2910  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
   2911  1.1  christos 
   2912  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
   2913  1.1  christos #ifdef __AARCH64EB__
   2914  1.1  christos 	rev	x6, x6
   2915  1.1  christos 	rev	x7, x7
   2916  1.1  christos #endif
   2917  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
   2918  1.1  christos 
   2919  1.1  christos 	eor	x6, x6, x13                    //AES final block - round 12 low
   2920  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2921  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   2922  1.1  christos 
   2923  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
   2924  1.1  christos 
   2925  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
   2926  1.1  christos 	eor	x7, x7, x14                    //AES final block - round 12 high
   2927  1.1  christos 	fmov	d5, x6                                //AES final block - mov low
   2928  1.1  christos 
   2929  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
   2930  1.1  christos 	fmov	v5.d[1], x7                            //AES final block - mov high
   2931  1.1  christos 
   2932  1.1  christos 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
   2933  1.1  christos 
   2934  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
   2935  1.1  christos 
   2936  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
   2937  1.1  christos 
   2938  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
   2939  1.1  christos 
   2940  1.1  christos 	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
   2941  1.1  christos 
   2942  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
   2943  1.1  christos 
   2944  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
   2945  1.1  christos .L192_enc_blocks_less_than_1:	//blocks	left <= 1
   2946  1.1  christos 
   2947  1.1  christos 	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
   2948  1.1  christos #ifndef __AARCH64EB__
   2949  1.1  christos 	rev	w9, w12
   2950  1.1  christos #else
   2951  1.1  christos 	mov	w9, w12
   2952  1.1  christos #endif
   2953  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128
   2954  1.1  christos 
   2955  1.1  christos 	sub	x1, x1, #128                   //bit_length -= 128
   2956  1.1  christos 	mvn	x14, xzr                                     //rk12_h = 0xffffffffffffffff
   2957  1.1  christos 
   2958  1.1  christos 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   2959  1.1  christos 	mvn	x13, xzr                                     //rk12_l = 0xffffffffffffffff
   2960  1.1  christos 
   2961  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128
   2962  1.1  christos 
   2963  1.1  christos 	lsr	x14, x14, x1                    //rk12_h is mask for top 64b of last block
   2964  1.1  christos 	cmp	x1, #64
   2965  1.1  christos 
   2966  1.1  christos 	csel	x6, x13, x14, lt
   2967  1.1  christos 	csel	x7, x14, xzr, lt
   2968  1.1  christos 
   2969  1.1  christos 	fmov	d0, x6                                //ctr0b is mask for last block
   2970  1.1  christos 
   2971  1.1  christos 	fmov	v0.d[1], x7
   2972  1.1  christos 
   2973  1.1  christos 	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
   2974  1.1  christos 
   2975  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final block
   2976  1.1  christos 
   2977  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   2978  1.1  christos 
   2979  1.1  christos 	mov	d8, v4.d[1]                                 //GHASH final block - mid
   2980  1.1  christos 
   2981  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
   2982  1.1  christos 
   2983  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
   2984  1.1  christos 
   2985  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
   2986  1.1  christos 
   2987  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
   2988  1.1  christos 
   2989  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
   2990  1.1  christos 
   2991  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
   2992  1.1  christos 
   2993  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
   2994  1.1  christos 	movi	v8.8b, #0xc2
   2995  1.1  christos 
   2996  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   2997  1.1  christos 
   2998  1.1  christos 	shl	d8, d8, #56              //mod_constant
   2999  1.1  christos 
   3000  1.1  christos 	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
   3001  1.1  christos 
   3002  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
   3003  1.1  christos 
   3004  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   3005  1.1  christos 
   3006  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   3007  1.1  christos 
   3008  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
   3009  1.1  christos 
   3010  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   3011  1.1  christos 
   3012  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   3013  1.1  christos 
   3014  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   3015  1.1  christos 
   3016  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   3017  1.1  christos 	str	w9, [x16, #12]                         //store the updated counter
   3018  1.1  christos 
   3019  1.1  christos 	st1	{ v5.16b}, [x2]                         //store all 16B
   3020  1.1  christos 
   3021  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
   3022  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   3023  1.1  christos 	rev64	v11.16b, v11.16b
   3024  1.1  christos 	mov	x0, x15
   3025  1.1  christos 	st1	{ v11.16b }, [x3]
   3026  1.1  christos 
   3027  1.1  christos 	ldp	x21, x22, [sp, #16]
   3028  1.1  christos 	ldp	x23, x24, [sp, #32]
   3029  1.1  christos 	ldp	d8, d9, [sp, #48]
   3030  1.1  christos 	ldp	d10, d11, [sp, #64]
   3031  1.1  christos 	ldp	d12, d13, [sp, #80]
   3032  1.1  christos 	ldp	d14, d15, [sp, #96]
   3033  1.1  christos 	ldp	x19, x20, [sp], #112
   3034  1.1  christos 	ret
   3035  1.1  christos 
   3036  1.1  christos .L192_enc_ret:	//early-return stub: only sets the result and returns; it does not touch sp or reload any saved registers (NOTE(review): presumably the target of a zero-length check at function entry, as aes_gcm_dec_192_kernel does with cbz x1 - the enc entry is outside this view, confirm)
   3037  1.1  christos 	mov	w0, #0x0	//result = 0; writing w0 also clears the upper 32 bits of x0
   3038  1.1  christos 	ret
   3039  1.1  christos .size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
   3040  1.1  christos .globl	aes_gcm_dec_192_kernel
   3041  1.1  christos .type	aes_gcm_dec_192_kernel,%function
   3042  1.1  christos .align	4
   3043  1.1  christos aes_gcm_dec_192_kernel:
   3044  1.2  christos 	AARCH64_VALID_CALL_TARGET
   3045  1.1  christos 	cbz	x1, .L192_dec_ret
   3046  1.1  christos 	stp	x19, x20, [sp, #-112]!
   3047  1.1  christos 	mov	x16, x4
   3048  1.1  christos 	mov	x8, x5
   3049  1.1  christos 	stp	x21, x22, [sp, #16]
   3050  1.1  christos 	stp	x23, x24, [sp, #32]
   3051  1.1  christos 	stp	d8, d9, [sp, #48]
   3052  1.1  christos 	stp	d10, d11, [sp, #64]
   3053  1.1  christos 	stp	d12, d13, [sp, #80]
   3054  1.1  christos 	stp	d14, d15, [sp, #96]
   3055  1.1  christos 
   3056  1.1  christos 	add	x4, x0, x1, lsr #3   //end_input_ptr
   3057  1.1  christos 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   3058  1.1  christos #ifdef __AARCH64EB__
   3059  1.1  christos 	rev	x10, x10
   3060  1.1  christos 	rev	x11, x11
   3061  1.1  christos #endif
   3062  1.1  christos 	ldp	x13, x14, [x8, #192]                     //load rk12
   3063  1.1  christos #ifdef __AARCH64EB__
   3064  1.1  christos 	ror	x13, x13, #32
   3065  1.1  christos 	ror	x14, x14, #32
   3066  1.1  christos #endif
   3067  1.1  christos 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   3068  1.1  christos 
   3069  1.1  christos 	ld1	{v18.4s}, [x8], #16                                  //load rk0
   3070  1.1  christos 
   3071  1.1  christos 	lsr	x5, x1, #3              //byte_len
   3072  1.1  christos 	mov	x15, x5
   3073  1.1  christos 	ld1	{v19.4s}, [x8], #16                               //load rk1
   3074  1.1  christos 
   3075  1.1  christos 	lsr	x12, x11, #32
   3076  1.1  christos 	orr	w11, w11, w11
   3077  1.1  christos 	fmov	d3, x10                               //CTR block 3
   3078  1.1  christos 
   3079  1.1  christos 	rev	w12, w12                                //rev_ctr32
   3080  1.1  christos 	fmov	d1, x10                               //CTR block 1
   3081  1.1  christos 
   3082  1.1  christos 	add	w12, w12, #1                            //increment rev_ctr32
   3083  1.1  christos 	ld1	{v20.4s}, [x8], #16                               //load rk2
   3084  1.1  christos 
   3085  1.1  christos 	aese	v0.16b, v18.16b
   3086  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   3087  1.1  christos 	rev	w9, w12                                 //CTR block 1
   3088  1.1  christos 
   3089  1.1  christos 	add	w12, w12, #1                            //CTR block 1
   3090  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 1
   3091  1.1  christos 	ld1	{v21.4s}, [x8], #16                               //load rk3
   3092  1.1  christos 
   3093  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 1
   3094  1.1  christos 	rev	w9, w12                                 //CTR block 2
   3095  1.1  christos 	add	w12, w12, #1                            //CTR block 2
   3096  1.1  christos 
   3097  1.1  christos 	fmov	d2, x10                               //CTR block 2
   3098  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 2
   3099  1.1  christos 
   3100  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 2
   3101  1.1  christos 	rev	w9, w12                                 //CTR block 3
   3102  1.1  christos 
   3103  1.1  christos 	aese	v0.16b, v19.16b
   3104  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   3105  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 3
   3106  1.1  christos 
   3107  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 3
   3108  1.1  christos 
   3109  1.1  christos 	ld1	{v22.4s}, [x8], #16                               //load rk4
   3110  1.1  christos 
   3111  1.1  christos 	aese	v0.16b, v20.16b
   3112  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   3113  1.1  christos 
   3114  1.1  christos 	aese	v2.16b, v18.16b
   3115  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   3116  1.1  christos 	ld1	{v23.4s}, [x8], #16                               //load rk5
   3117  1.1  christos 
   3118  1.1  christos 	aese	v1.16b, v18.16b
   3119  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   3120  1.1  christos 	ldr	q15, [x3, #112]                        //load h4l | h4h
   3121  1.1  christos #ifndef __AARCH64EB__
   3122  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
   3123  1.1  christos #endif
   3124  1.1  christos 	aese	v3.16b, v18.16b
   3125  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   3126  1.1  christos 	ldr	q13, [x3, #64]                         //load h2l | h2h
   3127  1.1  christos #ifndef __AARCH64EB__
   3128  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
   3129  1.1  christos #endif
   3130  1.1  christos 	aese	v2.16b, v19.16b
   3131  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   3132  1.1  christos 	ldr	q14, [x3, #80]                         //load h3l | h3h
   3133  1.1  christos #ifndef __AARCH64EB__
   3134  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
   3135  1.1  christos #endif
   3136  1.1  christos 	aese	v1.16b, v19.16b
   3137  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   3138  1.1  christos 
   3139  1.1  christos 	aese	v3.16b, v19.16b
   3140  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   3141  1.1  christos 	ldr	q12, [x3, #32]                         //load h1l | h1h
   3142  1.1  christos #ifndef __AARCH64EB__
   3143  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
   3144  1.1  christos #endif
   3145  1.1  christos 	aese	v2.16b, v20.16b
   3146  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   3147  1.1  christos 	ld1	{v24.4s}, [x8], #16                               //load rk6
   3148  1.1  christos 
   3149  1.1  christos 	aese	v0.16b, v21.16b
   3150  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   3151  1.1  christos 	ld1	{v25.4s}, [x8], #16                               //load rk7
   3152  1.1  christos 
   3153  1.1  christos 	aese	v1.16b, v20.16b
   3154  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   3155  1.1  christos 	ld1	{v26.4s}, [x8], #16                               //load rk8
   3156  1.1  christos 
   3157  1.1  christos 	aese	v3.16b, v20.16b
   3158  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   3159  1.1  christos 	ld1	{v27.4s}, [x8], #16                               //load rk9
   3160  1.1  christos 
   3161  1.1  christos 	aese	v2.16b, v21.16b
   3162  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   3163  1.1  christos 	ld1	{ v11.16b}, [x3]
   3164  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   3165  1.1  christos 	rev64	v11.16b, v11.16b
   3166  1.1  christos 
   3167  1.1  christos 	aese	v1.16b, v21.16b
   3168  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   3169  1.1  christos 	add	w12, w12, #1                            //CTR block 3
   3170  1.1  christos 
   3171  1.1  christos 	aese	v3.16b, v21.16b
   3172  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   3173  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   3174  1.1  christos 
   3175  1.1  christos 	aese	v0.16b, v22.16b
   3176  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   3177  1.1  christos 	ld1	{v28.4s}, [x8], #16                              //load rk10
   3178  1.1  christos 
   3179  1.1  christos 	aese	v1.16b, v22.16b
   3180  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   3181  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   3182  1.1  christos 
   3183  1.1  christos 	aese	v2.16b, v22.16b
   3184  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   3185  1.1  christos 
   3186  1.1  christos 	aese	v3.16b, v22.16b
   3187  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   3188  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   3189  1.1  christos 
   3190  1.1  christos 	aese	v0.16b, v23.16b
   3191  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   3192  1.1  christos 	ld1	{v29.4s}, [x8], #16                              //load rk11
   3193  1.1  christos 
   3194  1.1  christos 	aese	v1.16b, v23.16b
   3195  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   3196  1.1  christos 
   3197  1.1  christos 	aese	v2.16b, v23.16b
   3198  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   3199  1.1  christos 
   3200  1.1  christos 	aese	v3.16b, v23.16b
   3201  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   3202  1.1  christos 
   3203  1.1  christos 	aese	v0.16b, v24.16b
   3204  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   3205  1.1  christos 
   3206  1.1  christos 	aese	v2.16b, v24.16b
   3207  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   3208  1.1  christos 
   3209  1.1  christos 	aese	v3.16b, v24.16b
   3210  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   3211  1.1  christos 
   3212  1.1  christos 	aese	v0.16b, v25.16b
   3213  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   3214  1.1  christos 
   3215  1.1  christos 	aese	v2.16b, v25.16b
   3216  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   3217  1.1  christos 
   3218  1.1  christos 	aese	v3.16b, v25.16b
   3219  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   3220  1.1  christos 
   3221  1.1  christos 	aese	v1.16b, v24.16b
   3222  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   3223  1.1  christos 
   3224  1.1  christos 	aese	v2.16b, v26.16b
   3225  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   3226  1.1  christos 
   3227  1.1  christos 	aese	v3.16b, v26.16b
   3228  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   3229  1.1  christos 
   3230  1.1  christos 	aese	v1.16b, v25.16b
   3231  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   3232  1.1  christos 
   3233  1.1  christos 	aese	v2.16b, v27.16b
   3234  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   3235  1.1  christos 
   3236  1.1  christos 	aese	v3.16b, v27.16b
   3237  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   3238  1.1  christos 
   3239  1.1  christos 	aese	v1.16b, v26.16b
   3240  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   3241  1.1  christos 	sub	x5, x5, #1      //byte_len - 1
   3242  1.1  christos 
   3243  1.1  christos 	aese	v0.16b, v26.16b
   3244  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   3245  1.1  christos 	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   3246  1.1  christos 
   3247  1.1  christos 	aese	v3.16b, v28.16b
   3248  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   3249  1.1  christos 	add	x5, x5, x0
   3250  1.1  christos 
   3251  1.1  christos 	aese	v1.16b, v27.16b
   3252  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   3253  1.1  christos 	cmp	x0, x5                   //check if we have <= 4 blocks
   3254  1.1  christos 
   3255  1.1  christos 	aese	v0.16b, v27.16b
   3256  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   3257  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   3258  1.1  christos 
   3259  1.1  christos 	aese	v3.16b, v29.16b                                     //AES block 3 - round 11
   3260  1.1  christos 
   3261  1.1  christos 	aese	v2.16b, v28.16b
   3262  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   3263  1.1  christos 
   3264  1.1  christos 	aese	v1.16b, v28.16b
   3265  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   3266  1.1  christos 
   3267  1.1  christos 	aese	v0.16b, v28.16b
   3268  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   3269  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   3270  1.1  christos 
   3271  1.1  christos 	aese	v2.16b, v29.16b                                     //AES block 2 - round 11
   3272  1.1  christos 
   3273  1.1  christos 	aese	v1.16b, v29.16b                                     //AES block 1 - round 11
   3274  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   3275  1.1  christos 
   3276  1.1  christos 	aese	v0.16b, v29.16b                                     //AES block 0 - round 11
   3277  1.1  christos 	b.ge	.L192_dec_tail                                    //handle tail
   3278  1.1  christos 
   3279  1.1  christos 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
   3280  1.1  christos 
   3281  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   3282  1.1  christos 
   3283  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   3284  1.1  christos 	rev	w9, w12                                 //CTR block 4
   3285  1.1  christos 	ld1	{v6.16b, v7.16b}, [x0], #32               //AES block 2,3 - load ciphertext
   3286  1.1  christos 
   3287  1.1  christos 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   3288  1.1  christos 
   3289  1.1  christos 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   3290  1.1  christos 
   3291  1.1  christos 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   3292  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4
   3293  1.1  christos 	add	w12, w12, #1                            //CTR block 4
   3294  1.1  christos 
   3295  1.1  christos 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   3296  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   3297  1.1  christos 
   3298  1.1  christos 	fmov	d0, x10                               //CTR block 4
   3299  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   3300  1.1  christos 	cmp	x0, x5                   //check if we have <= 8 blocks
   3301  1.1  christos 
   3302  1.1  christos 	eor	x19, x19, x13                   //AES block 1 - round 12 low
   3303  1.1  christos #ifdef __AARCH64EB__
   3304  1.1  christos 	rev	x19, x19
   3305  1.1  christos #endif
   3306  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4
   3307  1.1  christos 	rev	w9, w12                                 //CTR block 5
   3308  1.1  christos 
   3309  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 5
   3310  1.1  christos 	fmov	d1, x10                               //CTR block 5
   3311  1.1  christos 	eor	x20, x20, x14                   //AES block 1 - round 12 high
   3312  1.1  christos #ifdef __AARCH64EB__
   3313  1.1  christos 	rev	x20, x20
   3314  1.1  christos #endif
   3315  1.1  christos 	add	w12, w12, #1                            //CTR block 5
   3316  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 5
   3317  1.1  christos 	eor	x6, x6, x13                   //AES block 0 - round 12 low
   3318  1.1  christos #ifdef __AARCH64EB__
   3319  1.1  christos 	rev	x6, x6
   3320  1.1  christos #endif
   3321  1.1  christos 	rev	w9, w12                                 //CTR block 6
   3322  1.1  christos 	eor	x7, x7, x14                   //AES block 0 - round 12 high
   3323  1.1  christos #ifdef __AARCH64EB__
   3324  1.1  christos 	rev	x7, x7
   3325  1.1  christos #endif
   3326  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   3327  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 6
   3328  1.1  christos 
   3329  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   3330  1.1  christos 
   3331  1.1  christos 	add	w12, w12, #1                            //CTR block 6
   3332  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   3333  1.1  christos 	b.ge	.L192_dec_prepretail                              //do prepretail
   3334  1.1  christos 
   3335  1.1  christos .L192_dec_main_loop:	//main	loop start
   3336  1.1  christos 	aese	v1.16b, v18.16b
   3337  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   3338  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   3339  1.1  christos 
   3340  1.1  christos 	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   3341  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   3342  1.1  christos 
   3343  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   3344  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   3345  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   3346  1.1  christos 
   3347  1.1  christos 	aese	v1.16b, v19.16b
   3348  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   3349  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   3350  1.1  christos 
   3351  1.1  christos 	aese	v0.16b, v18.16b
   3352  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   3353  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   3354  1.1  christos 
   3355  1.1  christos 	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   3356  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   3357  1.1  christos 
   3358  1.1  christos 	aese	v1.16b, v20.16b
   3359  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   3360  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   3361  1.1  christos 
   3362  1.1  christos 	aese	v0.16b, v19.16b
   3363  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   3364  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   3365  1.1  christos 
   3366  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   3367  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   3368  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   3369  1.1  christos 
   3370  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   3371  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   3372  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   3373  1.1  christos 
   3374  1.1  christos 	aese	v2.16b, v18.16b
   3375  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   3376  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   3377  1.1  christos 
   3378  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   3379  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   3380  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   3381  1.1  christos 
   3382  1.1  christos 	aese	v1.16b, v21.16b
   3383  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   3384  1.1  christos 
   3385  1.1  christos 	aese	v0.16b, v20.16b
   3386  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   3387  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
   3388  1.1  christos #ifdef __AARCH64EB__
   3389  1.1  christos 	rev	x22, x22
   3390  1.1  christos #endif
   3391  1.1  christos 	aese	v2.16b, v19.16b
   3392  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   3393  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   3394  1.1  christos 
   3395  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   3396  1.1  christos 
   3397  1.1  christos 	aese	v3.16b, v18.16b
   3398  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   3399  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   3400  1.1  christos 
   3401  1.1  christos 	aese	v2.16b, v20.16b
   3402  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   3403  1.1  christos 
   3404  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   3405  1.1  christos 	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
   3406  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
   3407  1.1  christos #ifdef __AARCH64EB__
   3408  1.1  christos 	rev	x21, x21
   3409  1.1  christos #endif
   3410  1.1  christos 	aese	v1.16b, v22.16b
   3411  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   3412  1.1  christos 
   3413  1.1  christos 	aese	v0.16b, v21.16b
   3414  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   3415  1.1  christos 
   3416  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   3417  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   3418  1.1  christos 
   3419  1.1  christos 	aese	v3.16b, v19.16b
   3420  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   3421  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
   3422  1.1  christos 
   3423  1.1  christos 	aese	v0.16b, v22.16b
   3424  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   3425  1.1  christos 
   3426  1.1  christos 	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   3427  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   3428  1.1  christos 
   3429  1.1  christos 	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   3430  1.1  christos 
   3431  1.1  christos 	aese	v0.16b, v23.16b
   3432  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   3433  1.1  christos 
   3434  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
   3435  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   3436  1.1  christos 
   3437  1.1  christos 	aese	v1.16b, v23.16b
   3438  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   3439  1.1  christos 
   3440  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   3441  1.1  christos 
   3442  1.1  christos 	aese	v3.16b, v20.16b
   3443  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   3444  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   3445  1.1  christos 
   3446  1.1  christos 	aese	v1.16b, v24.16b
   3447  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   3448  1.1  christos 
   3449  1.1  christos 	aese	v0.16b, v24.16b
   3450  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   3451  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   3452  1.1  christos 
   3453  1.1  christos 	aese	v3.16b, v21.16b
   3454  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   3455  1.1  christos 
   3456  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   3457  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
   3458  1.1  christos 
   3459  1.1  christos 	aese	v0.16b, v25.16b
   3460  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   3461  1.1  christos 
   3462  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   3463  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   3464  1.1  christos 
   3465  1.1  christos 	aese	v1.16b, v25.16b
   3466  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   3467  1.1  christos 
   3468  1.1  christos 	aese	v0.16b, v26.16b
   3469  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   3470  1.1  christos 	movi	v8.8b, #0xc2
   3471  1.1  christos 
   3472  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   3473  1.1  christos 
   3474  1.1  christos 	aese	v1.16b, v26.16b
   3475  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   3476  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   3477  1.1  christos 
   3478  1.1  christos 	aese	v2.16b, v21.16b
   3479  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   3480  1.1  christos 
   3481  1.1  christos 	aese	v0.16b, v27.16b
   3482  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   3483  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   3484  1.1  christos 
   3485  1.1  christos 	aese	v3.16b, v22.16b
   3486  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   3487  1.1  christos 
   3488  1.1  christos 	aese	v2.16b, v22.16b
   3489  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   3490  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   3491  1.1  christos 
   3492  1.1  christos 	aese	v0.16b, v28.16b
   3493  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   3494  1.1  christos 
   3495  1.1  christos 	aese	v1.16b, v27.16b
   3496  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   3497  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   3498  1.1  christos 
   3499  1.1  christos 	aese	v2.16b, v23.16b
   3500  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   3501  1.1  christos 
   3502  1.1  christos 	aese	v3.16b, v23.16b
   3503  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   3504  1.1  christos 	shl	d8, d8, #56               //mod_constant
   3505  1.1  christos 
   3506  1.1  christos 	aese	v1.16b, v28.16b
   3507  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   3508  1.1  christos 
   3509  1.1  christos 	aese	v2.16b, v24.16b
   3510  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   3511  1.1  christos 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
   3512  1.1  christos 
   3513  1.1  christos 	aese	v3.16b, v24.16b
   3514  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   3515  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   3516  1.1  christos 
   3517  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   3518  1.1  christos 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
   3519  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
   3520  1.1  christos #ifdef __AARCH64EB__
   3521  1.1  christos 	rev	x23, x23
   3522  1.1  christos #endif
   3523  1.1  christos 	aese	v2.16b, v25.16b
   3524  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   3525  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   3526  1.1  christos 
   3527  1.1  christos 	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11
   3528  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   3529  1.1  christos 
   3530  1.1  christos 	aese	v3.16b, v25.16b
   3531  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   3532  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   3533  1.1  christos 
   3534  1.1  christos 	aese	v2.16b, v26.16b
   3535  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   3536  1.1  christos 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
   3537  1.1  christos 
   3538  1.1  christos 	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
   3539  1.1  christos 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
   3540  1.1  christos 	rev	w9, w12                                 //CTR block 4k+8
   3541  1.1  christos 
   3542  1.1  christos 	aese	v3.16b, v26.16b
   3543  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   3544  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   3545  1.1  christos 
   3546  1.1  christos 	aese	v2.16b, v27.16b
   3547  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   3548  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   3549  1.1  christos 
   3550  1.1  christos 	cmp	x0, x5                   //.LOOP CONTROL
   3551  1.1  christos 
   3552  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   3553  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
   3554  1.1  christos #ifdef __AARCH64EB__
   3555  1.1  christos 	rev	x24, x24
   3556  1.1  christos #endif
   3557  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   3558  1.1  christos 
   3559  1.1  christos 	aese	v2.16b, v28.16b
   3560  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   3561  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   3562  1.1  christos 
   3563  1.1  christos 	aese	v3.16b, v27.16b
   3564  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   3565  1.1  christos 
   3566  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   3567  1.1  christos 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   3568  1.1  christos 
   3569  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   3570  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   3571  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   3572  1.1  christos 
   3573  1.1  christos 	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
   3574  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   3575  1.1  christos 
   3576  1.1  christos 	aese	v3.16b, v28.16b
   3577  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   3578  1.1  christos 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   3579  1.1  christos 
   3580  1.1  christos 	fmov	d0, x10                               //CTR block 4k+8
   3581  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+8
   3582  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   3583  1.1  christos 
   3584  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   3585  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4k+8
   3586  1.1  christos 	rev	w9, w12                                 //CTR block 4k+9
   3587  1.1  christos 
   3588  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
   3589  1.1  christos #ifdef __AARCH64EB__
   3590  1.1  christos 	rev	x6, x6
   3591  1.1  christos #endif
   3592  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   3593  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   3594  1.1  christos 
   3595  1.1  christos 	fmov	d1, x10                               //CTR block 4k+9
   3596  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+9
   3597  1.1  christos 	eor	x19, x19, x13                   //AES block 4k+5 - round 12 low
   3598  1.1  christos #ifdef __AARCH64EB__
   3599  1.1  christos 	rev	x19, x19
   3600  1.1  christos #endif
   3601  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 4k+9
   3602  1.1  christos 	rev	w9, w12                                 //CTR block 4k+10
   3603  1.1  christos 	eor	x20, x20, x14                   //AES block 4k+5 - round 12 high
   3604  1.1  christos #ifdef __AARCH64EB__
   3605  1.1  christos 	rev	x20, x20
   3606  1.1  christos #endif
   3607  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
   3608  1.1  christos #ifdef __AARCH64EB__
   3609  1.1  christos 	rev	x7, x7
   3610  1.1  christos #endif
   3611  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   3612  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   3613  1.1  christos 
   3614  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+10
   3615  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   3616  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   3617  1.1  christos 
   3618  1.1  christos 	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
   3619  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   3620  1.1  christos 	b.lt	.L192_dec_main_loop
   3621  1.1  christos 
   3622  1.1  christos .L192_dec_prepretail:	//PREPRETAIL
   3623  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   3624  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   3625  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   3626  1.1  christos 
   3627  1.1  christos 	aese	v1.16b, v18.16b
   3628  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   3629  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   3630  1.1  christos 
   3631  1.1  christos 	aese	v0.16b, v18.16b
   3632  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   3633  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   3634  1.1  christos 
   3635  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   3636  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   3637  1.1  christos 
   3638  1.1  christos 	aese	v1.16b, v19.16b
   3639  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   3640  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   3641  1.1  christos 
   3642  1.1  christos 	aese	v0.16b, v19.16b
   3643  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   3644  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   3645  1.1  christos 
   3646  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   3647  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   3648  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   3649  1.1  christos 
   3650  1.1  christos 	aese	v1.16b, v20.16b
   3651  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   3652  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   3653  1.1  christos 
   3654  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   3655  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   3656  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   3657  1.1  christos 
   3658  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   3659  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   3660  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   3661  1.1  christos 
   3662  1.1  christos 	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   3663  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
   3664  1.1  christos #ifdef __AARCH64EB__
   3665  1.1  christos 	rev	x24, x24
   3666  1.1  christos #endif
   3667  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   3668  1.1  christos 
   3669  1.1  christos 	aese	v0.16b, v20.16b
   3670  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   3671  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
   3672  1.1  christos #ifdef __AARCH64EB__
   3673  1.1  christos 	rev	x21, x21
   3674  1.1  christos #endif
   3675  1.1  christos 	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   3676  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
   3677  1.1  christos #ifdef __AARCH64EB__
   3678  1.1  christos 	rev	x22, x22
   3679  1.1  christos #endif
   3680  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   3681  1.1  christos 
   3682  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   3683  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
   3684  1.1  christos #ifdef __AARCH64EB__
   3685  1.1  christos 	rev	x23, x23
   3686  1.1  christos #endif
   3687  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   3688  1.1  christos 
   3689  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   3690  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   3691  1.1  christos 
   3692  1.1  christos 	aese	v3.16b, v18.16b
   3693  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   3694  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
   3695  1.1  christos 
   3696  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   3697  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   3698  1.1  christos 
   3699  1.1  christos 	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   3700  1.1  christos 	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
   3701  1.1  christos 
   3702  1.1  christos 	aese	v2.16b, v18.16b
   3703  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   3704  1.1  christos 
   3705  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   3706  1.1  christos 	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
   3707  1.1  christos 
   3708  1.1  christos 	aese	v3.16b, v19.16b
   3709  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   3710  1.1  christos 
   3711  1.1  christos 	aese	v2.16b, v19.16b
   3712  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   3713  1.1  christos 	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
   3714  1.1  christos 
   3715  1.1  christos 	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
   3716  1.1  christos 
   3717  1.1  christos 	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   3718  1.1  christos 
   3719  1.1  christos 	aese	v2.16b, v20.16b
   3720  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   3721  1.1  christos 	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
   3722  1.1  christos 
   3723  1.1  christos 	aese	v3.16b, v20.16b
   3724  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   3725  1.1  christos 	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
   3726  1.1  christos 
   3727  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   3728  1.1  christos 
   3729  1.1  christos 	aese	v0.16b, v21.16b
   3730  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   3731  1.1  christos 	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
   3732  1.1  christos 
   3733  1.1  christos 	aese	v1.16b, v21.16b
   3734  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   3735  1.1  christos 
   3736  1.1  christos 	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
   3737  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
   3738  1.1  christos 
   3739  1.1  christos 	aese	v0.16b, v22.16b
   3740  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   3741  1.1  christos 
   3742  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   3743  1.1  christos 	movi	v8.8b, #0xc2
   3744  1.1  christos 
   3745  1.1  christos 	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
   3746  1.1  christos 
   3747  1.1  christos 	aese	v2.16b, v21.16b
   3748  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   3749  1.1  christos 
   3750  1.1  christos 	shl	d8, d8, #56               //mod_constant
   3751  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   3752  1.1  christos 
   3753  1.1  christos 	aese	v0.16b, v23.16b
   3754  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   3755  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
   3756  1.1  christos 
   3757  1.1  christos 	aese	v2.16b, v22.16b
   3758  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   3759  1.1  christos 
   3760  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   3761  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   3762  1.1  christos 
   3763  1.1  christos 	aese	v0.16b, v24.16b
   3764  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   3765  1.1  christos 
   3766  1.1  christos 	aese	v3.16b, v21.16b
   3767  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   3768  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
   3769  1.1  christos 
   3770  1.1  christos 	aese	v2.16b, v23.16b
   3771  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   3772  1.1  christos 
   3773  1.1  christos 	aese	v0.16b, v25.16b
   3774  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   3775  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   3776  1.1  christos 
   3777  1.1  christos 	aese	v3.16b, v22.16b
   3778  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   3779  1.1  christos 
   3780  1.1  christos 	aese	v2.16b, v24.16b
   3781  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   3782  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   3783  1.1  christos 
   3784  1.1  christos 	aese	v0.16b, v26.16b
   3785  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   3786  1.1  christos 
   3787  1.1  christos 	aese	v3.16b, v23.16b
   3788  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   3789  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   3790  1.1  christos 
   3791  1.1  christos 	aese	v1.16b, v22.16b
   3792  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   3793  1.1  christos 
   3794  1.1  christos 	aese	v2.16b, v25.16b
   3795  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   3796  1.1  christos 
   3797  1.1  christos 	aese	v0.16b, v27.16b
   3798  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   3799  1.1  christos 
   3800  1.1  christos 	aese	v1.16b, v23.16b
   3801  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   3802  1.1  christos 
   3803  1.1  christos 	aese	v3.16b, v24.16b
   3804  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   3805  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   3806  1.1  christos 
   3807  1.1  christos 	aese	v0.16b, v28.16b
   3808  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   3809  1.1  christos 
   3810  1.1  christos 	aese	v1.16b, v24.16b
   3811  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   3812  1.1  christos 
   3813  1.1  christos 	aese	v3.16b, v25.16b
   3814  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   3815  1.1  christos 
   3816  1.1  christos 	aese	v2.16b, v26.16b
   3817  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   3818  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   3819  1.1  christos 
   3820  1.1  christos 	aese	v1.16b, v25.16b
   3821  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   3822  1.1  christos 
   3823  1.1  christos 	aese	v3.16b, v26.16b
   3824  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   3825  1.1  christos 
   3826  1.1  christos 	aese	v2.16b, v27.16b
   3827  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   3828  1.1  christos 
   3829  1.1  christos 	aese	v1.16b, v26.16b
   3830  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   3831  1.1  christos 
   3832  1.1  christos 	aese	v3.16b, v27.16b
   3833  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   3834  1.1  christos 
   3835  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   3836  1.1  christos 
   3837  1.1  christos 	aese	v1.16b, v27.16b
   3838  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   3839  1.1  christos 
   3840  1.1  christos 	aese	v2.16b, v28.16b
   3841  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   3842  1.1  christos 
   3843  1.1  christos 	aese	v3.16b, v28.16b
   3844  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   3845  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   3846  1.1  christos 
   3847  1.1  christos 	aese	v1.16b, v28.16b
   3848  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   3849  1.1  christos 
   3850  1.1  christos 	aese	v0.16b, v29.16b
   3851  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   3852  1.1  christos 
   3853  1.1  christos 	aese	v2.16b, v29.16b
   3854  1.1  christos 
   3855  1.1  christos 	aese	v1.16b, v29.16b
   3856  1.1  christos 
   3857  1.1  christos 	aese	v3.16b, v29.16b
   3858  1.1  christos 
   3859  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   3860  1.1  christos .L192_dec_tail:	//TAIL
	// Tail dispatch for the 192-bit decrypt kernel.
	// x5 = x4 - x0 is the number of input bytes not yet processed. The first
	// remaining ciphertext block is loaded into v5 and decrypted right here
	// (v5 ^ v0, then ^ x13:x14 in general registers -> x6:x7). The running
	// GHASH value in v11 is copied, 64-bit halves swapped, into v8 as the
	// "partial tag". Control then dispatches on x5:
	//   > 48 bytes -> .L192_dec_blocks_more_than_3
	//   > 32 bytes -> .L192_dec_blocks_more_than_2
	//   > 16 bytes -> .L192_dec_blocks_more_than_1
	//   otherwise  -> .L192_dec_blocks_less_than_1
	// On the shorter paths v9/v10/v11 are cleared (the tag now lives in v8)
	// and the blocks in v1/v2 are moved so that the later labels always find
	// their keystream in v2 (final-1 block) and v3 (final block).
   3861  1.1  christos 
   3862  1.1  christos 	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
   3863  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
   3864  1.1  christos 
   3865  1.1  christos 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
   3866  1.1  christos 
   3867  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   3868  1.1  christos 
   3869  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   3870  1.1  christos 
   3871  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   3872  1.1  christos 
   3873  1.1  christos 	cmp	x5, #48                  //flags consumed by the b.gt below; the eor/rev in between do not set flags
   3874  1.1  christos 
   3875  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
   3876  1.1  christos #ifdef __AARCH64EB__
   3877  1.1  christos 	rev	x7, x7
   3878  1.1  christos #endif
   3879  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
   3880  1.1  christos #ifdef __AARCH64EB__
   3881  1.1  christos 	rev	x6, x6
   3882  1.1  christos #endif
   3883  1.1  christos 	b.gt	.L192_dec_blocks_more_than_3
   3884  1.1  christos 
   3885  1.1  christos 	movi	v11.8b, #0                 //clear GHASH low accumulator (64-bit movi also zeroes the upper half)
   3886  1.1  christos 	movi	v9.8b, #0                  //clear GHASH high accumulator
   3887  1.1  christos 
   3888  1.1  christos 	mov	v3.16b, v2.16b             //shift blocks down: old v2 will be used for the final block
   3889  1.1  christos 	mov	v2.16b, v1.16b             //old v1 will be used for the final-1 block
   3890  1.1  christos 	sub	w12, w12, #1               //NOTE(review): presumably un-counts one pre-generated counter block that will not be consumed; w12 is what is later stored back as the counter - confirm
   3891  1.1  christos 
   3892  1.1  christos 	movi	v10.8b, #0                 //clear GHASH mid accumulator
   3893  1.1  christos 	cmp	x5, #32
   3894  1.1  christos 	b.gt	.L192_dec_blocks_more_than_2
   3895  1.1  christos 
   3896  1.1  christos 	mov	v3.16b, v1.16b             //at most 2 blocks: old v1 will be used for the final block
   3897  1.1  christos 	cmp	x5, #16
   3898  1.1  christos 	sub	w12, w12, #1               //sub (not subs) leaves the flags from the cmp above intact for b.gt
   3899  1.1  christos 
   3900  1.1  christos 	b.gt	.L192_dec_blocks_more_than_1
   3901  1.1  christos 
   3902  1.1  christos 	sub	w12, w12, #1
   3903  1.1  christos 	b	.L192_dec_blocks_less_than_1
   3904  1.1  christos .L192_dec_blocks_more_than_3:	//blocks	left >  3
	// Handles the "final-3" block, then falls through to more_than_2.
	//  - stores the plaintext already held in x6:x7 to [x2] (post-increment)
	//  - GHASH input is the ciphertext in v5, rev64'd and XORed with the
	//    partial tag in v8; v8 is then zeroed so the tag is fed in only once
	//  - v11 (low), v9 (high) and v10 (mid) are written here, not
	//    accumulated, which is why this path skips the accumulator clearing
	//  - the next ciphertext block is loaded and decrypted with v1 into x6:x7
	// v22 served as an AES round key in the main loop and is reused as
	// scratch from here on.
	// NOTE(review): v15 / v17.d[1] presumably hold the H^4 hash-key material
	// loaded during setup (not visible in this part of the file) - confirm.
   3905  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
   3906  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
   3907  1.1  christos 
   3908  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
   3909  1.1  christos 
   3910  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3911  1.1  christos 
   3912  1.1  christos 	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
   3913  1.1  christos 
   3914  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
   3915  1.1  christos 	mov	x6, v0.d[0]                            //AES final-2 block - mov low
   3916  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
   3917  1.1  christos 
   3918  1.1  christos 	mov	x7, v0.d[1]                            //AES final-2 block - mov high
   3919  1.1  christos 
   3920  1.1  christos 	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
   3921  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
   3922  1.1  christos 
   3923  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
   3924  1.1  christos 
   3925  1.1  christos 	eor	x6, x6, x13                   //AES final-2 block - round 12 low
   3926  1.1  christos #ifdef __AARCH64EB__
   3927  1.1  christos 	rev	x6, x6
   3928  1.1  christos #endif
   3929  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   3930  1.1  christos 
   3931  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
   3932  1.1  christos 	eor	x7, x7, x14                   //AES final-2 block - round 12 high
   3933  1.1  christos #ifdef __AARCH64EB__
   3934  1.1  christos 	rev	x7, x7
   3935  1.1  christos #endif
   3936  1.1  christos .L192_dec_blocks_more_than_2:	//blocks	left >  2
	// Handles the "final-2" block, then falls through to more_than_1.
	// Entered either by fall-through from more_than_3 or directly from the
	// tail dispatch; in both cases x6:x7 hold a finished plaintext block and
	// v5 holds its ciphertext.
	//  - GHASH: rev64(v5) ^ v8 is multiplied into v11 (low), v9 (high) and
	//    v10 (mid), this time accumulating with eor
	//  - v8 is zeroed after use; if it was already zero the eor is a no-op
	//  - x6:x7 are stored to [x2], then reloaded with the next block
	//    decrypted using the keystream in v2
	// v20/v21/v22 (AES round keys in the main loop) are scratch here.
   3937  1.1  christos 
   3938  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
   3939  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
   3940  1.1  christos 
   3941  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3942  1.1  christos 
   3943  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   3944  1.1  christos 
   3945  1.1  christos 	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
   3946  1.1  christos 
   3947  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
   3948  1.1  christos 
   3949  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
   3950  1.1  christos 
   3951  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
   3952  1.1  christos 
   3953  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
   3954  1.1  christos 	mov	x7, v0.d[1]                            //AES final-1 block - mov high
   3955  1.1  christos 
   3956  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
   3957  1.1  christos 	mov	x6, v0.d[0]                            //AES final-1 block - mov low
   3958  1.1  christos 
   3959  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
   3960  1.1  christos 
   3961  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
   3962  1.1  christos 
   3963  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
   3964  1.1  christos 	eor	x7, x7, x14                   //AES final-1 block - round 12 high
   3965  1.1  christos #ifdef __AARCH64EB__
   3966  1.1  christos 	rev	x7, x7
   3967  1.1  christos #endif
   3968  1.1  christos 	eor	x6, x6, x13                   //AES final-1 block - round 12 low
   3969  1.1  christos #ifdef __AARCH64EB__
   3970  1.1  christos 	rev	x6, x6
   3971  1.1  christos #endif
   3972  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
   3973  1.1  christos .L192_dec_blocks_more_than_1:	//blocks	left >  1
	// Handles the "final-1" block, then falls through to less_than_1.
	//  - GHASH: rev64(v5) ^ v8 is multiplied by v13 (low/high); the mid term
	//    duplicates v22.d[0] into d[1] so that pmull2 can use the upper half
	//    of v16. Results are accumulated into v11/v9/v10.
	//  - the finished plaintext in x6:x7 is stored to [x2]
	//  - the final ciphertext block is loaded into v5 and decrypted with the
	//    keystream in v3; the result is left in x6:x7 and is NOT stored here,
	//    because the next section has to mask a possibly partial block first.
   3974  1.1  christos 
   3975  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
   3976  1.1  christos 
   3977  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   3978  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
   3979  1.1  christos 
   3980  1.1  christos 	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
   3981  1.1  christos 
   3982  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
   3983  1.1  christos 
   3984  1.1  christos 	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
   3985  1.1  christos 	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
   3986  1.1  christos 
   3987  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
   3988  1.1  christos 
   3989  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
   3990  1.1  christos 
   3991  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
   3992  1.1  christos 	mov	x7, v0.d[1]                            //AES final block - mov high
   3993  1.1  christos 
   3994  1.1  christos 	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
   3995  1.1  christos 	mov	x6, v0.d[0]                            //AES final block - mov low
   3996  1.1  christos 
   3997  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
   3998  1.1  christos 
   3999  1.1  christos 	movi	v8.8b, #0                                        //suppress further partial tag feed in
   4000  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
   4001  1.1  christos 	eor	x7, x7, x14                   //AES final block - round 12 high
   4002  1.1  christos #ifdef __AARCH64EB__
   4003  1.1  christos 	rev	x7, x7
   4004  1.1  christos #endif
   4005  1.1  christos 	eor	x6, x6, x13                   //AES final block - round 12 low
   4006  1.1  christos #ifdef __AARCH64EB__
   4007  1.1  christos 	rev	x6, x6
   4008  1.1  christos #endif
   4009  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
   4010  1.1  christos .L192_dec_blocks_less_than_1:	//blocks	left <= 1
	// Final (possibly partial) block, GHASH reduction, and function exit.
	// On entry x6:x7 hold the decrypted final block and v5 its ciphertext.
	//
	// 1. Mask construction. x1 (bit length) is turned into the number of
	//    unused bits in the last block: (128 - (x1 mod 128)) mod 128.
	//    x9 / x10 become the masks for the low / high 8 bytes of the block.
	//    x13 and x14, which held the last round key until now, are
	//    overwritten with all-ones for this purpose.
	// 2. Output merge. The 16 bytes currently at [x2] are read into x4:x5;
	//    masked-in plaintext bits are combined with masked-out existing
	//    bits and the full 16 bytes are written back with a single stp.
	// 3. The 32-bit counter in w12 is written to [x16, #12].
	// 4. GHASH of the masked ciphertext block, followed by the modular
	//    reduction (constant 0xc2 << 56); the result is written to [x3].
	// 5. x0 = x15 is the return value, then the 112-byte frame is popped.
	//    NOTE(review): x15 presumably holds the byte length saved at entry,
	//    as in the sibling kernels; the setup is outside this view - confirm.
   4011  1.1  christos 
   4012  1.1  christos 	mvn	x13, xzr                                      //rk12_l = 0xffffffffffffffff
   4013  1.1  christos 	ldp	x4, x5, [x2]  //load existing bytes we need to not overwrite
   4014  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128
   4015  1.1  christos 
   4016  1.1  christos 	sub	x1, x1, #128                    //bit_length -= 128
   4017  1.1  christos 
   4018  1.1  christos 	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
   4019  1.1  christos 
   4020  1.1  christos 	and	x1, x1, #127                    //bit_length %= 128
   4021  1.1  christos 	mvn	x14, xzr                                      //rk12_h = 0xffffffffffffffff
   4022  1.1  christos 
   4023  1.1  christos 	lsr	x14, x14, x1                     //rk12_h is mask for top 64b of last block
   4024  1.1  christos 	cmp	x1, #64                          //register-count lsr uses x1 mod 64, so x14 is valid for both outcomes
   4025  1.1  christos 
   4026  1.1  christos 	csel	x9, x13, x14, lt                 //low-half mask:  all ones if x1 < 64, else the shifted mask
   4027  1.1  christos 	csel	x10, x14, xzr, lt                //high-half mask: shifted mask if x1 < 64, else zero
   4028  1.1  christos 
   4029  1.1  christos 	fmov	d0, x9                                   //ctr0b is mask for last block
   4030  1.1  christos 	and	x6, x6, x9                       //keep only the valid low plaintext bits
   4031  1.1  christos 	bic	x4, x4, x9           //mask out low existing bytes
   4032  1.1  christos 
   4033  1.1  christos 	orr	x6, x6, x4                       //low 8 output bytes = new plaintext | preserved existing bytes
   4034  1.1  christos 	mov	v0.d[1], x10                     //v0 = {x9, x10}: 128-bit mask for the last block
   4035  1.1  christos #ifndef __AARCH64EB__
   4036  1.1  christos 	rev	w9, w12                          //little-endian build: byte-swap so the counter is stored big-endian
   4037  1.1  christos #else
   4038  1.1  christos 	mov	w9, w12
   4039  1.1  christos #endif
   4040  1.1  christos 
   4041  1.1  christos 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   4042  1.1  christos 	str	w9, [x16, #12]                          //store the updated counter
   4043  1.1  christos 
   4044  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final block
   4045  1.1  christos 
   4046  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   4047  1.1  christos 	bic	x5, x5, x10 //mask out high existing bytes
   4048  1.1  christos 
   4049  1.1  christos 	and	x7, x7, x10                      //keep only the valid high plaintext bits
   4050  1.1  christos 
   4051  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   4052  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   4053  1.1  christos 
   4054  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   4055  1.1  christos 
   4056  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   4057  1.1  christos 
   4058  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   4059  1.1  christos 
   4060  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   4061  1.1  christos 
   4062  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   4063  1.1  christos 
   4064  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   4065  1.1  christos 	movi	v8.8b, #0xc2                     //v8 is free again: start building the reduction constant
   4066  1.1  christos 
   4067  1.1  christos 	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   4068  1.1  christos 
   4069  1.1  christos 	shl	d8, d8, #56               //mod_constant
   4070  1.1  christos 
   4071  1.1  christos 	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
   4072  1.1  christos 
   4073  1.1  christos 	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   4074  1.1  christos 	orr	x7, x7, x5                       //high 8 output bytes = new plaintext | preserved existing bytes
   4075  1.1  christos 	stp	x6, x7, [x2]                     //write the final block; always a full 16-byte store
   4076  1.1  christos 
   4077  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   4078  1.1  christos 
   4079  1.1  christos 	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
   4080  1.1  christos 
   4081  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   4082  1.1  christos 
   4083  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   4084  1.1  christos 
   4085  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   4086  1.1  christos 
   4087  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   4088  1.1  christos 
   4089  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   4090  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8    //swap 64-bit halves ...
   4091  1.1  christos 	rev64	v11.16b, v11.16b                 //... and reverse bytes within each half before storing
   4092  1.1  christos 	mov	x0, x15                          //return value
   4093  1.1  christos 	st1	{ v11.16b }, [x3]                //write the updated GHASH value back to [x3]
   4094  1.1  christos 
   4095  1.1  christos 	ldp	x21, x22, [sp, #16]              //epilogue: reload x19-x24 and d8-d15 from the stack frame
   4096  1.1  christos 	ldp	x23, x24, [sp, #32]
   4097  1.1  christos 	ldp	d8, d9, [sp, #48]
   4098  1.1  christos 	ldp	d10, d11, [sp, #64]
   4099  1.1  christos 	ldp	d12, d13, [sp, #80]
   4100  1.1  christos 	ldp	d14, d15, [sp, #96]
   4101  1.1  christos 	ldp	x19, x20, [sp], #112             //post-index: pops the 112-byte frame
   4102  1.1  christos 	ret
   4103  1.1  christos 
   4104  1.1  christos .L192_dec_ret:
	// Alternate exit that returns 0 without touching the stack.
	// NOTE(review): presumably reached from a zero-length check at function
	// entry, before the frame is pushed (the sibling aes_gcm_enc_256_kernel
	// does `cbz x1, .L256_enc_ret` ahead of its first stp); the branch to
	// this label is outside this view - confirm.
   4105  1.1  christos 	mov	w0, #0x0                         //writing w0 also zeroes the upper 32 bits of x0
   4106  1.1  christos 	ret
   4107  1.1  christos .size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
   4108  1.1  christos .globl	aes_gcm_enc_256_kernel
   4109  1.1  christos .type	aes_gcm_enc_256_kernel,%function
   4110  1.1  christos .align	4
   4111  1.1  christos aes_gcm_enc_256_kernel:
   4112  1.2  christos 	AARCH64_VALID_CALL_TARGET
   4113  1.1  christos 	cbz	x1, .L256_enc_ret
   4114  1.1  christos 	stp	x19, x20, [sp, #-112]!
   4115  1.1  christos 	mov	x16, x4
   4116  1.1  christos 	mov	x8, x5
   4117  1.1  christos 	stp	x21, x22, [sp, #16]
   4118  1.1  christos 	stp	x23, x24, [sp, #32]
   4119  1.1  christos 	stp	d8, d9, [sp, #48]
   4120  1.1  christos 	stp	d10, d11, [sp, #64]
   4121  1.1  christos 	stp	d12, d13, [sp, #80]
   4122  1.1  christos 	stp	d14, d15, [sp, #96]
   4123  1.1  christos 
   4124  1.1  christos 	add	x4, x0, x1, lsr #3   //end_input_ptr
   4125  1.1  christos 	lsr	x5, x1, #3              //byte_len
   4126  1.1  christos 	mov	x15, x5
   4127  1.1  christos 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   4128  1.1  christos #ifdef __AARCH64EB__
   4129  1.1  christos 	rev	x10, x10
   4130  1.1  christos 	rev	x11, x11
   4131  1.1  christos #endif
   4132  1.1  christos 	ldp	x13, x14, [x8, #224]                     //load rk14
   4133  1.1  christos #ifdef __AARCH64EB__
   4134  1.1  christos 	ror	x13, x13, #32
   4135  1.1  christos 	ror	x14, x14, #32
   4136  1.1  christos #endif
   4137  1.1  christos 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   4138  1.1  christos 	sub	x5, x5, #1      //byte_len - 1
   4139  1.1  christos 
   4140  1.1  christos 	ld1	{v18.4s}, [x8], #16                               //load rk0
   4141  1.1  christos 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   4142  1.1  christos 
   4143  1.1  christos 	ld1	{v19.4s}, [x8], #16                               //load rk1
   4144  1.1  christos 	add	x5, x5, x0
   4145  1.1  christos 
   4146  1.1  christos 	lsr	x12, x11, #32
   4147  1.1  christos 	fmov	d2, x10                               //CTR block 2
   4148  1.1  christos 	orr	w11, w11, w11
   4149  1.1  christos 
   4150  1.1  christos 	rev	w12, w12                                //rev_ctr32
   4151  1.1  christos 	cmp	x0, x5                   //check if we have <= 4 blocks
   4152  1.1  christos 	fmov	d1, x10                               //CTR block 1
   4153  1.1  christos 
   4154  1.1  christos 	aese	v0.16b, v18.16b
   4155  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   4156  1.1  christos 	add	w12, w12, #1                            //increment rev_ctr32
   4157  1.1  christos 
   4158  1.1  christos 	rev	w9, w12                                 //CTR block 1
   4159  1.1  christos 	fmov	d3, x10                               //CTR block 3
   4160  1.1  christos 
   4161  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 1
   4162  1.1  christos 	add	w12, w12, #1                            //CTR block 1
   4163  1.1  christos 	ld1	{v20.4s}, [x8], #16                               //load rk2
   4164  1.1  christos 
   4165  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 1
   4166  1.1  christos 	rev	w9, w12                                 //CTR block 2
   4167  1.1  christos 	add	w12, w12, #1                            //CTR block 2
   4168  1.1  christos 
   4169  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 2
   4170  1.1  christos 	ld1	{v21.4s}, [x8], #16                               //load rk3
   4171  1.1  christos 
   4172  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 2
   4173  1.1  christos 	rev	w9, w12                                 //CTR block 3
   4174  1.1  christos 
   4175  1.1  christos 	aese	v0.16b, v19.16b
   4176  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   4177  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 3
   4178  1.1  christos 
   4179  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 3
   4180  1.1  christos 
   4181  1.1  christos 	aese	v1.16b, v18.16b
   4182  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   4183  1.1  christos 	ld1	{v22.4s}, [x8], #16                               //load rk4
   4184  1.1  christos 
   4185  1.1  christos 	aese	v0.16b, v20.16b
   4186  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   4187  1.1  christos 	ld1	{v23.4s}, [x8], #16                               //load rk5
   4188  1.1  christos 
   4189  1.1  christos 	aese	v2.16b, v18.16b
   4190  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   4191  1.1  christos 	ld1	{v24.4s}, [x8], #16                               //load rk6
   4192  1.1  christos 
   4193  1.1  christos 	aese	v1.16b, v19.16b
   4194  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   4195  1.1  christos 	ldr	q14, [x3, #80]                         //load h3l | h3h
   4196  1.1  christos #ifndef __AARCH64EB__
   4197  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
   4198  1.1  christos #endif
   4199  1.1  christos 	aese	v3.16b, v18.16b
   4200  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   4201  1.1  christos 	ld1	{v25.4s}, [x8], #16                               //load rk7
   4202  1.1  christos 
   4203  1.1  christos 	aese	v2.16b, v19.16b
   4204  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   4205  1.1  christos 	ld1	{v26.4s}, [x8], #16                               //load rk8
   4206  1.1  christos 
   4207  1.1  christos 	aese	v1.16b, v20.16b
   4208  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   4209  1.1  christos 	ldr	q13, [x3, #64]                         //load h2l | h2h
   4210  1.1  christos #ifndef __AARCH64EB__
   4211  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
   4212  1.1  christos #endif
   4213  1.1  christos 	aese	v3.16b, v19.16b
   4214  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   4215  1.1  christos 	ld1	{v27.4s}, [x8], #16                               //load rk9
   4216  1.1  christos 
   4217  1.1  christos 	aese	v2.16b, v20.16b
   4218  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   4219  1.1  christos 	ldr	q15, [x3, #112]                        //load h4l | h4h
   4220  1.1  christos #ifndef __AARCH64EB__
   4221  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
   4222  1.1  christos #endif
   4223  1.1  christos 	aese	v1.16b, v21.16b
   4224  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   4225  1.1  christos 	ld1	{v28.4s}, [x8], #16                              //load rk10
   4226  1.1  christos 
   4227  1.1  christos 	aese	v3.16b, v20.16b
   4228  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   4229  1.1  christos 	ld1	{v29.4s}, [x8], #16                              //load rk11
   4230  1.1  christos 
   4231  1.1  christos 	aese	v2.16b, v21.16b
   4232  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   4233  1.1  christos 	add	w12, w12, #1                            //CTR block 3
   4234  1.1  christos 
   4235  1.1  christos 	aese	v0.16b, v21.16b
   4236  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   4237  1.1  christos 
   4238  1.1  christos 	aese	v3.16b, v21.16b
   4239  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   4240  1.1  christos 	ld1	{ v11.16b}, [x3]
   4241  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   4242  1.1  christos 	rev64	v11.16b, v11.16b
   4243  1.1  christos 
   4244  1.1  christos 	aese	v2.16b, v22.16b
   4245  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   4246  1.1  christos 
   4247  1.1  christos 	aese	v0.16b, v22.16b
   4248  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   4249  1.1  christos 
   4250  1.1  christos 	aese	v1.16b, v22.16b
   4251  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   4252  1.1  christos 
   4253  1.1  christos 	aese	v3.16b, v22.16b
   4254  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   4255  1.1  christos 
   4256  1.1  christos 	aese	v0.16b, v23.16b
   4257  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   4258  1.1  christos 
   4259  1.1  christos 	aese	v1.16b, v23.16b
   4260  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   4261  1.1  christos 
   4262  1.1  christos 	aese	v3.16b, v23.16b
   4263  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   4264  1.1  christos 
   4265  1.1  christos 	aese	v2.16b, v23.16b
   4266  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   4267  1.1  christos 
   4268  1.1  christos 	aese	v1.16b, v24.16b
   4269  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   4270  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   4271  1.1  christos 
   4272  1.1  christos 	aese	v3.16b, v24.16b
   4273  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   4274  1.1  christos 	ld1	{v30.4s}, [x8], #16                              //load rk12
   4275  1.1  christos 
   4276  1.1  christos 	aese	v0.16b, v24.16b
   4277  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   4278  1.1  christos 	ldr	q12, [x3, #32]                         //load h1l | h1h
   4279  1.1  christos #ifndef __AARCH64EB__
   4280  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
   4281  1.1  christos #endif
   4282  1.1  christos 	aese	v2.16b, v24.16b
   4283  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   4284  1.1  christos 	ld1	{v31.4s}, [x8], #16                              //load rk13
   4285  1.1  christos 
   4286  1.1  christos 	aese	v1.16b, v25.16b
   4287  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   4288  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   4289  1.1  christos 
   4290  1.1  christos 	aese	v0.16b, v25.16b
   4291  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   4292  1.1  christos 
   4293  1.1  christos 	aese	v2.16b, v25.16b
   4294  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   4295  1.1  christos 
   4296  1.1  christos 	aese	v3.16b, v25.16b
   4297  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   4298  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   4299  1.1  christos 
   4300  1.1  christos 	aese	v1.16b, v26.16b
   4301  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   4302  1.1  christos 
   4303  1.1  christos 	aese	v2.16b, v26.16b
   4304  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   4305  1.1  christos 
   4306  1.1  christos 	aese	v3.16b, v26.16b
   4307  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   4308  1.1  christos 
   4309  1.1  christos 	aese	v1.16b, v27.16b
   4310  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   4311  1.1  christos 
   4312  1.1  christos 	aese	v2.16b, v27.16b
   4313  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   4314  1.1  christos 
   4315  1.1  christos 	aese	v0.16b, v26.16b
   4316  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   4317  1.1  christos 
   4318  1.1  christos 	aese	v1.16b, v28.16b
   4319  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   4320  1.1  christos 
   4321  1.1  christos 	aese	v3.16b, v27.16b
   4322  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   4323  1.1  christos 
   4324  1.1  christos 	aese	v0.16b, v27.16b
   4325  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   4326  1.1  christos 
   4327  1.1  christos 	aese	v2.16b, v28.16b
   4328  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   4329  1.1  christos 
   4330  1.1  christos 	aese	v3.16b, v28.16b
   4331  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   4332  1.1  christos 
   4333  1.1  christos 	aese	v1.16b, v29.16b
   4334  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
   4335  1.1  christos 
   4336  1.1  christos 	aese	v2.16b, v29.16b
   4337  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
   4338  1.1  christos 
   4339  1.1  christos 	aese	v0.16b, v28.16b
   4340  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   4341  1.1  christos 
   4342  1.1  christos 	aese	v1.16b, v30.16b
   4343  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
   4344  1.1  christos 
   4345  1.1  christos 	aese	v2.16b, v30.16b
   4346  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
   4347  1.1  christos 
   4348  1.1  christos 	aese	v0.16b, v29.16b
   4349  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
   4350  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   4351  1.1  christos 
   4352  1.1  christos 	aese	v3.16b, v29.16b
   4353  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
   4354  1.1  christos 
   4355  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
   4356  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   4357  1.1  christos 
   4358  1.1  christos 	aese	v0.16b, v30.16b
   4359  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
   4360  1.1  christos 
   4361  1.1  christos 	aese	v3.16b, v30.16b
   4362  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
   4363  1.1  christos 
   4364  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
   4365  1.1  christos 
   4366  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
   4367  1.1  christos 
   4368  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
   4369  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   4370  1.1  christos 	b.ge	.L256_enc_tail                                    //handle tail
   4371  1.1  christos 
   4372  1.1  christos 	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
   4373  1.1  christos #ifdef __AARCH64EB__
   4374  1.1  christos 	rev	x19, x19
   4375  1.1  christos 	rev	x20, x20
   4376  1.1  christos #endif
   4377  1.1  christos 	rev	w9, w12                                 //CTR block 4
   4378  1.1  christos 	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
   4379  1.1  christos #ifdef __AARCH64EB__
   4380  1.1  christos 	rev	x6, x6
   4381  1.1  christos 	rev	x7, x7
   4382  1.1  christos #endif
   4383  1.1  christos 	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
   4384  1.1  christos #ifdef __AARCH64EB__
   4385  1.1  christos 	rev	x23, x23
   4386  1.1  christos 	rev	x24, x24
   4387  1.1  christos #endif
   4388  1.1  christos 	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
   4389  1.1  christos #ifdef __AARCH64EB__
   4390  1.1  christos 	rev	x21, x21
   4391  1.1  christos 	rev	x22, x22
   4392  1.1  christos #endif
   4393  1.1  christos 	add	x0, x0, #64                       //AES input_ptr update
   4394  1.1  christos 
   4395  1.1  christos 	eor	x19, x19, x13                     //AES block 1 - round 14 low
   4396  1.1  christos 	eor	x20, x20, x14                     //AES block 1 - round 14 high
   4397  1.1  christos 
   4398  1.1  christos 	fmov	d5, x19                               //AES block 1 - mov low
   4399  1.1  christos 	eor	x6, x6, x13                     //AES block 0 - round 14 low
   4400  1.1  christos 
   4401  1.1  christos 	eor	x7, x7, x14                     //AES block 0 - round 14 high
   4402  1.1  christos 	eor	x24, x24, x14                     //AES block 3 - round 14 high
   4403  1.1  christos 	fmov	d4, x6                               //AES block 0 - mov low
   4404  1.1  christos 
   4405  1.1  christos 	cmp	x0, x5                   //check if we have <= 8 blocks
   4406  1.1  christos 	fmov	v4.d[1], x7                           //AES block 0 - mov high
   4407  1.1  christos 	eor	x23, x23, x13                     //AES block 3 - round 14 low
   4408  1.1  christos 
   4409  1.1  christos 	eor	x21, x21, x13                     //AES block 2 - round 14 low
   4410  1.1  christos 	fmov	v5.d[1], x20                           //AES block 1 - mov high
   4411  1.1  christos 
   4412  1.1  christos 	fmov	d6, x21                               //AES block 2 - mov low
   4413  1.1  christos 	add	w12, w12, #1                            //CTR block 4
   4414  1.1  christos 
   4415  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4
   4416  1.1  christos 	fmov	d7, x23                               //AES block 3 - mov low
   4417  1.1  christos 	eor	x22, x22, x14                     //AES block 2 - round 14 high
   4418  1.1  christos 
   4419  1.1  christos 	fmov	v6.d[1], x22                           //AES block 2 - mov high
   4420  1.1  christos 
   4421  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
   4422  1.1  christos 	fmov	d0, x10                               //CTR block 4
   4423  1.1  christos 
   4424  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4
   4425  1.1  christos 	rev	w9, w12                                 //CTR block 5
   4426  1.1  christos 	add	w12, w12, #1                            //CTR block 5
   4427  1.1  christos 
   4428  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
   4429  1.1  christos 	fmov	d1, x10                               //CTR block 5
   4430  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 5
   4431  1.1  christos 
   4432  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 5
   4433  1.1  christos 	rev	w9, w12                                 //CTR block 6
   4434  1.1  christos 	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
   4435  1.1  christos 
   4436  1.1  christos 	fmov	v7.d[1], x24                           //AES block 3 - mov high
   4437  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 6
   4438  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
   4439  1.1  christos 
   4440  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
   4441  1.1  christos 
   4442  1.1  christos 	add	w12, w12, #1                            //CTR block 6
   4443  1.1  christos 	fmov	d2, x10                               //CTR block 6
   4444  1.1  christos 
   4445  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 6
   4446  1.1  christos 	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
   4447  1.1  christos 	rev	w9, w12                                 //CTR block 7
   4448  1.1  christos 
   4449  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 7
   4450  1.1  christos 
   4451  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
   4452  1.1  christos 	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
   4453  1.1  christos 	b.ge	.L256_enc_prepretail                               //do prepretail
   4454  1.1  christos 
   4455  1.1  christos .L256_enc_main_loop:	//main	loop start
                       // Steady-state loop, four blocks per pass. Each pass interleaves:
                       //   * AES rounds 0-13 on the counter blocks in v0-v3 (round keys v18-v31), and
                       //   * GHASH of the four ciphertext blocks left in v4-v7 by the previous pass
                       //     (or by the code just before the loop), folded into the accumulator v11.
                       // Register roles as used inside this loop:
                       //   x0         input pointer; plaintext is loaded from [x0,#0..#63], then x0 += 64
                       //   x2         output pointer; post-incremented by 16 on every st1
                       //   x5         bound that x0 is compared against (set up outside this excerpt)
                       //   x13, x14   low/high words XORed into the plaintext as the "round 14" step
                       //   x10        low 64 bits of every counter block (fmov dN, x10)
                       //   x11, w12   high 64 bits of a counter block = x11 | (rev(w12) << 32), built in x9;
                       //              w12 is incremented once per block
                       //   v15,v14,v13,v12  hash-key operands for blocks 4k, 4k+1, 4k+2, 4k+3
                       //   v17, v16   operands for the Karatsuba "mid" products (4k/4k+1 and 4k+2/4k+3)
                       //   v9/v10/v11 GHASH high/mid/low partial sums; v8 scratch and mod_constant
                       // The cmp near the end of the body sets the flags for the closing b.lt; no
                       // instruction between the two writes the flags.
   4456  1.1  christos 	aese	v0.16b, v18.16b
   4457  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   4458  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
   4459  1.1  christos 
   4460  1.1  christos 	aese	v1.16b, v18.16b
   4461  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   4462  1.1  christos 	fmov	d3, x10                               //CTR block 4k+3
   4463  1.1  christos 
   4464  1.1  christos 	aese	v2.16b, v18.16b
   4465  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   4466  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   4467  1.1  christos 
   4468  1.1  christos 	aese	v0.16b, v19.16b
   4469  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   4470  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+3
   4471  1.1  christos 
   4472  1.1  christos 	aese	v1.16b, v19.16b
   4473  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   4474  1.1  christos 	ldp	x23, x24, [x0, #48]           //AES block 4k+7 - load plaintext
   4475  1.1  christos #ifdef __AARCH64EB__
   4476  1.1  christos 	rev	x23, x23                          //big-endian build only: byte-swap the loaded plaintext words
   4477  1.1  christos 	rev	x24, x24
   4478  1.1  christos #endif
   4479  1.1  christos 	aese	v2.16b, v19.16b
   4480  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   4481  1.1  christos 	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
   4482  1.1  christos #ifdef __AARCH64EB__
   4483  1.1  christos 	rev	x21, x21
   4484  1.1  christos 	rev	x22, x22
   4485  1.1  christos #endif
   4486  1.1  christos 	aese	v0.16b, v20.16b
   4487  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   4488  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   4489  1.1  christos 
   4490  1.1  christos 	aese	v1.16b, v20.16b
   4491  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   4492  1.1  christos 
   4493  1.1  christos 	aese	v3.16b, v18.16b
   4494  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   4495  1.1  christos 	eor	x23, x23, x13                     //AES block 4k+7 - round 14 low
   4496  1.1  christos 
   4497  1.1  christos 	aese	v0.16b, v21.16b
   4498  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   4499  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   4500  1.1  christos 
   4501  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   4502  1.1  christos 	eor	x22, x22, x14                     //AES block 4k+6 - round 14 high
   4503  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   4504  1.1  christos 
   4505  1.1  christos 	aese	v3.16b, v19.16b
   4506  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   4507  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
   4508  1.1  christos 
   4509  1.1  christos 	aese	v0.16b, v22.16b
   4510  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   4511  1.1  christos 
   4512  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   4513  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   4514  1.1  christos 
   4515  1.1  christos 	aese	v2.16b, v20.16b
   4516  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   4517  1.1  christos 
   4518  1.1  christos 	aese	v0.16b, v23.16b
   4519  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   4520  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4521  1.1  christos 
   4522  1.1  christos 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   4523  1.1  christos 
   4524  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   4525  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
   4526  1.1  christos 
   4527  1.1  christos 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   4528  1.1  christos 
   4529  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   4530  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   4531  1.1  christos 
   4532  1.1  christos 	aese	v1.16b, v21.16b
   4533  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   4534  1.1  christos 
   4535  1.1  christos 	aese	v3.16b, v20.16b
   4536  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   4537  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   4538  1.1  christos 
   4539  1.1  christos 	aese	v2.16b, v21.16b
   4540  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   4541  1.1  christos 
   4542  1.1  christos 	aese	v1.16b, v22.16b
   4543  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   4544  1.1  christos 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   4545  1.1  christos 
   4546  1.1  christos 	aese	v3.16b, v21.16b
   4547  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   4548  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   4549  1.1  christos 
   4550  1.1  christos 	aese	v2.16b, v22.16b
   4551  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   4552  1.1  christos 
   4553  1.1  christos 	aese	v0.16b, v24.16b
   4554  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   4555  1.1  christos 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   4556  1.1  christos 
   4557  1.1  christos 	aese	v3.16b, v22.16b
   4558  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   4559  1.1  christos 
   4560  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   4561  1.1  christos 
   4562  1.1  christos 	aese	v0.16b, v25.16b
   4563  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   4564  1.1  christos 
   4565  1.1  christos 	aese	v3.16b, v23.16b
   4566  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   4567  1.1  christos 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   4568  1.1  christos 
   4569  1.1  christos 	aese	v1.16b, v23.16b
   4570  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   4571  1.1  christos 
   4572  1.1  christos 	aese	v0.16b, v26.16b
   4573  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   4574  1.1  christos 
   4575  1.1  christos 	aese	v2.16b, v23.16b
   4576  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   4577  1.1  christos 
   4578  1.1  christos 	aese	v1.16b, v24.16b
   4579  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   4580  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   4581  1.1  christos 
   4582  1.1  christos 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   4583  1.1  christos 
   4584  1.1  christos 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   4585  1.1  christos 
   4586  1.1  christos 	aese	v1.16b, v25.16b
   4587  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   4588  1.1  christos 
   4589  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   4590  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   4591  1.1  christos 
   4592  1.1  christos 	aese	v3.16b, v24.16b
   4593  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   4594  1.1  christos 	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
   4595  1.1  christos #ifdef __AARCH64EB__
   4596  1.1  christos 	rev	x19, x19
   4597  1.1  christos 	rev	x20, x20
   4598  1.1  christos #endif
   4599  1.1  christos 	aese	v1.16b, v26.16b
   4600  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   4601  1.1  christos 	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
   4602  1.1  christos 
   4603  1.1  christos 	aese	v2.16b, v24.16b
   4604  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   4605  1.1  christos 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   4606  1.1  christos 
   4607  1.1  christos 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   4608  1.1  christos 
   4609  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   4610  1.1  christos 	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
   4611  1.1  christos 
   4612  1.1  christos 	aese	v2.16b, v25.16b
   4613  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   4614  1.1  christos 	eor	x19, x19, x13                     //AES block 4k+5 - round 14 low
   4615  1.1  christos 
   4616  1.1  christos 	aese	v1.16b, v27.16b
   4617  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   4618  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   4619  1.1  christos 
   4620  1.1  christos 	aese	v3.16b, v25.16b
   4621  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   4622  1.1  christos 	eor	x21, x21, x13                     //AES block 4k+6 - round 14 low
   4623  1.1  christos 
   4624  1.1  christos 	aese	v0.16b, v27.16b
   4625  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   4626  1.1  christos 	movi	v8.8b, #0xc2                              //MODULO - 0xc2 in each low byte of v8; the shl below finishes mod_constant
   4627  1.1  christos 
   4628  1.1  christos 	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
   4629  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   4630  1.1  christos 	fmov	d5, x19                               //AES block 4k+5 - mov low
   4631  1.1  christos 
   4632  1.1  christos 	aese	v2.16b, v26.16b
   4633  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   4634  1.1  christos 	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
   4635  1.1  christos #ifdef __AARCH64EB__
   4636  1.1  christos 	rev	x6, x6
   4637  1.1  christos 	rev	x7, x7
   4638  1.1  christos #endif
   4639  1.1  christos 	aese	v0.16b, v28.16b
   4640  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   4641  1.1  christos 	shl	d8, d8, #56               //mod_constant = 0xc2 << 56 (only the top byte survives the shift)
   4642  1.1  christos 
   4643  1.1  christos 	aese	v3.16b, v26.16b
   4644  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   4645  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   4646  1.1  christos 
   4647  1.1  christos 	aese	v2.16b, v27.16b
   4648  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   4649  1.1  christos 
   4650  1.1  christos 	aese	v1.16b, v28.16b
   4651  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   4652  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
   4653  1.1  christos 
   4654  1.1  christos 	aese	v3.16b, v27.16b
   4655  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   4656  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+3
   4657  1.1  christos 
   4658  1.1  christos 	aese	v0.16b, v29.16b
   4659  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   4660  1.1  christos 	eor	v4.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   4661  1.1  christos 
   4662  1.1  christos 	aese	v1.16b, v29.16b
   4663  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   4664  1.1  christos 	add	x0, x0, #64                       //AES input_ptr update
   4665  1.1  christos 
   4666  1.1  christos 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   4667  1.1  christos 	rev	w9, w12                                 //CTR block 4k+8
   4668  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   4669  1.1  christos 
   4670  1.1  christos 	aese	v2.16b, v28.16b
   4671  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   4672  1.1  christos 	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
   4673  1.1  christos 
   4674  1.1  christos 	aese	v1.16b, v30.16b
   4675  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   4676  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //MODULO - karatsuba tidy up
   4677  1.1  christos 
   4678  1.1  christos 	aese	v3.16b, v28.16b
   4679  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   4680  1.1  christos 	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
   4681  1.1  christos 
   4682  1.1  christos 	fmov	d4, x6                               //AES block 4k+4 - mov low
   4683  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   4684  1.1  christos 	eor	v7.16b, v9.16b, v7.16b                   //MODULO - fold into mid
   4685  1.1  christos 
   4686  1.1  christos 	aese	v0.16b, v30.16b
   4687  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   4688  1.1  christos 	eor	x20, x20, x14                     //AES block 4k+5 - round 14 high
   4689  1.1  christos 
   4690  1.1  christos 	aese	v2.16b, v29.16b
   4691  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   4692  1.1  christos 	eor	x24, x24, x14                     //AES block 4k+7 - round 14 high
   4693  1.1  christos 
   4694  1.1  christos 	aese	v3.16b, v29.16b
   4695  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   4696  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+8
   4697  1.1  christos 
   4698  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13 (last aese; not followed by aesmc)
   4699  1.1  christos 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
   4700  1.1  christos 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   4701  1.1  christos 
   4702  1.1  christos 	aese	v2.16b, v30.16b
   4703  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   4704  1.1  christos 	fmov	d7, x23                               //AES block 4k+7 - mov low
   4705  1.1  christos 
   4706  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   4707  1.1  christos 	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
   4708  1.1  christos 
   4709  1.1  christos 	fmov	d6, x21                               //AES block 4k+6 - mov low
   4710  1.1  christos 	cmp	x0, x5                   //LOOP CONTROL - flags are consumed by the b.lt at the end of the loop
   4711  1.1  christos 
   4712  1.1  christos 	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
   4713  1.1  christos 
   4714  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
   4715  1.1  christos 	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result: (plaintext ^ x13:x14) ^ round 13 output
   4716  1.1  christos 	fmov	d0, x10                               //CTR block 4k+8
   4717  1.1  christos 
   4718  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4k+8
   4719  1.1  christos 	rev	w9, w12                                 //CTR block 4k+9
   4720  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+9
   4721  1.1  christos 
   4722  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
   4723  1.1  christos 	fmov	d1, x10                               //CTR block 4k+9
   4724  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   4725  1.1  christos 
   4726  1.1  christos 	aese	v3.16b, v30.16b
   4727  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   4728  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 4k+9
   4729  1.1  christos 
   4730  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   4731  1.1  christos 	rev	w9, w12                                 //CTR block 4k+10
   4732  1.1  christos 	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
   4733  1.1  christos 
   4734  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   4735  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
   4736  1.1  christos 	fmov	v7.d[1], x24                           //AES block 4k+7 - mov high
   4737  1.1  christos 
   4738  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   4739  1.1  christos 	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
   4740  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+10
   4741  1.1  christos 
   4742  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   4743  1.1  christos 	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
   4744  1.1  christos 	fmov	d2, x10                               //CTR block 4k+10
   4745  1.1  christos 
   4746  1.1  christos 	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
   4747  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+10
   4748  1.1  christos 	rev	w9, w12                                 //CTR block 4k+11
   4749  1.1  christos 
   4750  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   4751  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
   4752  1.1  christos 
   4753  1.1  christos 	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+7 - result
   4754  1.1  christos 	st1	{ v7.16b}, [x2], #16                     //AES block 4k+7 - store result
   4755  1.1  christos 	b.lt	.L256_enc_main_loop                                //repeat while x0 < x5 (signed, from the cmp above)
   4756  1.1  christos 
   4757  1.1  christos .L256_enc_prepretail:	//PREPRETAIL
   4758  1.1  christos 	aese	v1.16b, v18.16b
   4759  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   4760  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
   4761  1.1  christos 
   4762  1.1  christos 	aese	v2.16b, v18.16b
   4763  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   4764  1.1  christos 	fmov	d3, x10                               //CTR block 4k+3
   4765  1.1  christos 
   4766  1.1  christos 	aese	v0.16b, v18.16b
   4767  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   4768  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
   4769  1.1  christos 
   4770  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+3
   4771  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   4772  1.1  christos 
   4773  1.1  christos 	aese	v2.16b, v19.16b
   4774  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   4775  1.1  christos 
   4776  1.1  christos 	aese	v0.16b, v19.16b
   4777  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   4778  1.1  christos 
   4779  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   4780  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
   4781  1.1  christos 
   4782  1.1  christos 	aese	v2.16b, v20.16b
   4783  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   4784  1.1  christos 
   4785  1.1  christos 	aese	v3.16b, v18.16b
   4786  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   4787  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   4788  1.1  christos 
   4789  1.1  christos 	aese	v1.16b, v19.16b
   4790  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   4791  1.1  christos 
   4792  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   4793  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   4794  1.1  christos 
   4795  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   4796  1.1  christos 
   4797  1.1  christos 	aese	v2.16b, v21.16b
   4798  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   4799  1.1  christos 
   4800  1.1  christos 	aese	v1.16b, v20.16b
   4801  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   4802  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   4803  1.1  christos 
   4804  1.1  christos 	aese	v0.16b, v20.16b
   4805  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   4806  1.1  christos 
   4807  1.1  christos 	aese	v3.16b, v19.16b
   4808  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   4809  1.1  christos 
   4810  1.1  christos 	aese	v1.16b, v21.16b
   4811  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   4812  1.1  christos 
   4813  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   4814  1.1  christos 
   4815  1.1  christos 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   4816  1.1  christos 
   4817  1.1  christos 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   4818  1.1  christos 
   4819  1.1  christos 	aese	v3.16b, v20.16b
   4820  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   4821  1.1  christos 
   4822  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   4823  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   4824  1.1  christos 
   4825  1.1  christos 	aese	v0.16b, v21.16b
   4826  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   4827  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   4828  1.1  christos 
   4829  1.1  christos 	aese	v3.16b, v21.16b
   4830  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   4831  1.1  christos 
   4832  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   4833  1.1  christos 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   4834  1.1  christos 
   4835  1.1  christos 	aese	v0.16b, v22.16b
   4836  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   4837  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
   4838  1.1  christos 
   4839  1.1  christos 	aese	v3.16b, v22.16b
   4840  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   4841  1.1  christos 
   4842  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   4843  1.1  christos 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   4844  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+3
   4845  1.1  christos 
   4846  1.1  christos 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   4847  1.1  christos 
   4848  1.1  christos 	aese	v3.16b, v23.16b
   4849  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   4850  1.1  christos 
   4851  1.1  christos 	aese	v2.16b, v22.16b
   4852  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   4853  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   4854  1.1  christos 
   4855  1.1  christos 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   4856  1.1  christos 
   4857  1.1  christos 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   4858  1.1  christos 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   4859  1.1  christos 
   4860  1.1  christos 	aese	v2.16b, v23.16b
   4861  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   4862  1.1  christos 
   4863  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   4864  1.1  christos 	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
   4865  1.1  christos 
   4866  1.1  christos 	aese	v1.16b, v22.16b
   4867  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   4868  1.1  christos 
   4869  1.1  christos 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   4870  1.1  christos 
   4871  1.1  christos 	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
   4872  1.1  christos 
   4873  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   4874  1.1  christos 
   4875  1.1  christos 	aese	v1.16b, v23.16b
   4876  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   4877  1.1  christos 
   4878  1.1  christos 	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
   4879  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   4880  1.1  christos 
   4881  1.1  christos 	aese	v0.16b, v23.16b
   4882  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   4883  1.1  christos 
   4884  1.1  christos 	aese	v1.16b, v24.16b
   4885  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   4886  1.1  christos 
   4887  1.1  christos 	aese	v2.16b, v24.16b
   4888  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   4889  1.1  christos 
   4890  1.1  christos 	aese	v0.16b, v24.16b
   4891  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   4892  1.1  christos 	movi	v8.8b, #0xc2
   4893  1.1  christos 
   4894  1.1  christos 	aese	v3.16b, v24.16b
   4895  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   4896  1.1  christos 
   4897  1.1  christos 	aese	v1.16b, v25.16b
   4898  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   4899  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   4900  1.1  christos 
   4901  1.1  christos 	aese	v0.16b, v25.16b
   4902  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   4903  1.1  christos 
   4904  1.1  christos 	aese	v3.16b, v25.16b
   4905  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   4906  1.1  christos 	shl	d8, d8, #56               //mod_constant
   4907  1.1  christos 
   4908  1.1  christos 	aese	v1.16b, v26.16b
   4909  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   4910  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
   4911  1.1  christos 
   4912  1.1  christos 	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   4913  1.1  christos 
   4914  1.1  christos 	aese	v3.16b, v26.16b
   4915  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   4916  1.1  christos 
   4917  1.1  christos 	aese	v1.16b, v27.16b
   4918  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   4919  1.1  christos 
   4920  1.1  christos 	aese	v0.16b, v26.16b
   4921  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   4922  1.1  christos 	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
   4923  1.1  christos 
   4924  1.1  christos 	aese	v3.16b, v27.16b
   4925  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   4926  1.1  christos 
   4927  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
   4928  1.1  christos 
   4929  1.1  christos 	pmull	v4.1q, v9.1d, v8.1d
   4930  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8
   4931  1.1  christos 
   4932  1.1  christos 	aese	v3.16b, v28.16b
   4933  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   4934  1.1  christos 
   4935  1.1  christos 	aese	v2.16b, v25.16b
   4936  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   4937  1.1  christos 	eor	v10.16b, v10.16b, v11.16b
   4938  1.1  christos 
   4939  1.1  christos 	aese	v1.16b, v28.16b
   4940  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   4941  1.1  christos 
   4942  1.1  christos 	aese	v0.16b, v27.16b
   4943  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   4944  1.1  christos 
   4945  1.1  christos 	aese	v2.16b, v26.16b
   4946  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   4947  1.1  christos 
   4948  1.1  christos 	aese	v1.16b, v29.16b
   4949  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   4950  1.1  christos 	eor	v10.16b, v10.16b, v4.16b
   4951  1.1  christos 
   4952  1.1  christos 	aese	v0.16b, v28.16b
   4953  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   4954  1.1  christos 
   4955  1.1  christos 	aese	v2.16b, v27.16b
   4956  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   4957  1.1  christos 
   4958  1.1  christos 	aese	v1.16b, v30.16b
   4959  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   4960  1.1  christos 
   4961  1.1  christos 	aese	v0.16b, v29.16b
   4962  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   4963  1.1  christos 	eor	v10.16b, v10.16b, v9.16b
   4964  1.1  christos 
   4965  1.1  christos 	aese	v3.16b, v29.16b
   4966  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   4967  1.1  christos 
   4968  1.1  christos 	aese	v2.16b, v28.16b
   4969  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   4970  1.1  christos 
   4971  1.1  christos 	aese	v0.16b, v30.16b
   4972  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   4973  1.1  christos 
   4974  1.1  christos 	pmull	v4.1q, v10.1d, v8.1d
   4975  1.1  christos 
   4976  1.1  christos 	aese	v2.16b, v29.16b
   4977  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   4978  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8
   4979  1.1  christos 
   4980  1.1  christos 	aese	v3.16b, v30.16b
   4981  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   4982  1.1  christos 
   4983  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   4984  1.1  christos 	eor	v11.16b, v11.16b, v4.16b
   4985  1.1  christos 
   4986  1.1  christos 	aese	v2.16b, v30.16b
   4987  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   4988  1.1  christos 
   4989  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   4990  1.1  christos 
   4991  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   4992  1.1  christos 
   4993  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   4994  1.1  christos 	eor	v11.16b, v11.16b, v10.16b
   4995  1.1  christos .L256_enc_tail:	//TAIL - dispatch on the number of bytes left (thresholds 48/32/16)
   4996  1.1  christos 
   4997  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag (v11 with its 64-bit halves swapped); it is XORed into the first block hashed below
   4998  1.1  christos 	sub	x5, x4, x0   //x5 (until now main_end_input_ptr) = x4 - x0 = number of bytes left to process
   4999  1.1  christos 	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext; NOTE(review): always reads a full 16 bytes, even if x5 < 16 - confirm callers guarantee this is readable
   5000  1.1  christos #ifdef __AARCH64EB__
   5001  1.1  christos 	rev	x6, x6                          //big-endian build only: byte-swap the loaded words
   5002  1.1  christos 	rev	x7, x7
   5003  1.1  christos #endif
   5004  1.1  christos 	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
   5005  1.1  christos 	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
   5006  1.1  christos 
   5007  1.1  christos 	cmp	x5, #48                         //more than 3 blocks (48 bytes) left?
   5008  1.1  christos 	fmov	d4, x6                               //AES block 4k+4 - mov low
   5009  1.1  christos 
   5010  1.1  christos 	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
   5011  1.1  christos 
   5012  1.1  christos 	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
   5013  1.1  christos 	b.gt	.L256_enc_blocks_more_than_3    //yes - the sections below consume v1, v2, v3 in order, nothing to shuffle
   5014  1.1  christos 
   5015  1.1  christos 	cmp	x5, #32                         //more than 2 blocks (32 bytes) left?
   5016  1.1  christos 	mov	v3.16b, v2.16b                  //the "final block" code XORs with v3: hand it the block 4k+6 value
   5017  1.1  christos 	movi	v11.8b, #0                      //GHASH low accumulator starts from zero (the more_than_3 code that assigns it is skipped)
   5018  1.1  christos 
   5019  1.1  christos 	movi	v9.8b, #0                       //GHASH high accumulator starts from zero
   5020  1.1  christos 	sub	w12, w12, #1                    //one counter value fewer is consumed (w12 is what is later stored back as the updated counter)
   5021  1.1  christos 
   5022  1.1  christos 	mov	v2.16b, v1.16b                  //the "final-1 block" code XORs with v2: hand it the block 4k+5 value
   5023  1.1  christos 	movi	v10.8b, #0                      //GHASH mid accumulator starts from zero
   5024  1.1  christos 	b.gt	.L256_enc_blocks_more_than_2    //flags still come from the cmp against #32 (mov/movi/sub do not set flags)
   5025  1.1  christos 
   5026  1.1  christos 	mov	v3.16b, v1.16b                  //at most 2 blocks left: the "final block" code takes the block 4k+5 value instead
   5027  1.1  christos 	sub	w12, w12, #1                    //a second counter value is unused
   5028  1.1  christos 	cmp	x5, #16                         //more than 1 block (16 bytes) left?
   5029  1.1  christos 
   5030  1.1  christos 	b.gt	.L256_enc_blocks_more_than_1
   5031  1.1  christos 
   5032  1.1  christos 	sub	w12, w12, #1                    //a third counter value is unused: only the block already in v5 remains
   5033  1.1  christos 	b	.L256_enc_blocks_less_than_1
   5034  1.1  christos .L256_enc_blocks_more_than_3:	//blocks left > 3 - store the final-3 ciphertext, start the tail GHASH with it, encrypt final-2
   5035  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
   5036  1.1  christos 
   5037  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
   5038  1.1  christos #ifdef __AARCH64EB__
   5039  1.1  christos 	rev	x6, x6                         //big-endian build only: byte-swap the loaded words
   5040  1.1  christos 	rev	x7, x7
   5041  1.1  christos #endif
   5042  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   5043  1.1  christos 
   5044  1.1  christos 	eor	x6, x6, x13                    //AES final-2 block - round 14 low
   5045  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   5046  1.1  christos 
   5047  1.1  christos 	eor	x7, x7, x14                    //AES final-2 block - round 14 high
   5048  1.1  christos 
   5049  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid (v22 was an aese operand in the main loop; no aese follows, so it is reused as scratch)
   5050  1.1  christos 	fmov	d5, x6                                //AES final-2 block - mov low
   5051  1.1  christos 
   5052  1.1  christos 	fmov	v5.d[1], x7                            //AES final-2 block - mov high
   5053  1.1  christos 
   5054  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid (upper half XOR lower half of the block)
   5055  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5056  1.1  christos 
   5057  1.1  christos 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid (multiplier for the mid product is the upper half of v17)
   5058  1.1  christos 
   5059  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low (assigned, not accumulated: first product of the tail)
   5060  1.1  christos 
   5061  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high (assigned, not accumulated)
   5062  1.1  christos 
   5063  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid (assigned, not accumulated)
   5064  1.1  christos 	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
   5065  1.1  christos .L256_enc_blocks_more_than_2:	//blocks left > 2 - store the final-2 ciphertext (v5), fold it into GHASH, encrypt final-1 with v2
   5066  1.1  christos 
   5067  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
   5068  1.1  christos 
   5069  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
   5070  1.1  christos #ifdef __AARCH64EB__
   5071  1.1  christos 	rev	x6, x6                         //big-endian build only: byte-swap the loaded words
   5072  1.1  christos 	rev	x7, x7
   5073  1.1  christos #endif
   5074  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
   5075  1.1  christos 
   5076  1.1  christos 	eor	x6, x6, x13                    //AES final-1 block - round 14 low
   5077  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero if the more_than_3 code ran)
   5078  1.1  christos 
   5079  1.1  christos 	fmov	d5, x6                                //AES final-1 block - mov low
   5080  1.1  christos 	eor	x7, x7, x14                    //AES final-1 block - round 14 high
   5081  1.1  christos 
   5082  1.1  christos 	fmov	v5.d[1], x7                            //AES final-1 block - mov high
   5083  1.1  christos 
   5084  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5085  1.1  christos 
   5086  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high (v20 reused as scratch; no aese follows in the tail)
   5087  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
   5088  1.1  christos 
   5089  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low (v21 reused as scratch)
   5090  1.1  christos 
   5091  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
   5092  1.1  christos 
   5093  1.1  christos 	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
   5094  1.1  christos 
   5095  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
   5096  1.1  christos 
   5097  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid (multiplier is the lower half of v17)
   5098  1.1  christos 
   5099  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
   5100  1.1  christos 
   5101  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
   5102  1.1  christos .L256_enc_blocks_more_than_1:	//blocks left > 1 - store the final-1 ciphertext (v5), fold it into GHASH, encrypt the final block with v3
   5103  1.1  christos 
   5104  1.1  christos 	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
   5105  1.1  christos 
   5106  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
   5107  1.1  christos 
   5108  1.1  christos 	ldp	x6, x7, [x0], #16          //AES final block - load input low & high; NOTE(review): reads a full 16 bytes even when the final block is partial - confirm callers guarantee this is readable
   5109  1.1  christos #ifdef __AARCH64EB__
   5110  1.1  christos 	rev	x6, x6                         //big-endian build only: byte-swap the loaded words
   5111  1.1  christos 	rev	x7, x7
   5112  1.1  christos #endif
   5113  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero if an earlier tail section ran)
   5114  1.1  christos 
   5115  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   5116  1.1  christos 
   5117  1.1  christos 	eor	x6, x6, x13                    //AES final block - round 14 low
   5118  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
   5119  1.1  christos 
   5120  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
   5121  1.1  christos 	eor	x7, x7, x14                    //AES final block - round 14 high
   5122  1.1  christos 
   5123  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
   5124  1.1  christos 
   5125  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
   5126  1.1  christos 
   5127  1.1  christos 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid (copy into the upper lane so pmull2 below can pair it with the upper half of v16)
   5128  1.1  christos 	fmov	d5, x6                                //AES final block - mov low
   5129  1.1  christos 
   5130  1.1  christos 	fmov	v5.d[1], x7                            //AES final block - mov high
   5131  1.1  christos 
   5132  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid (multiplier is the upper half of v16)
   5133  1.1  christos 
   5134  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
   5135  1.1  christos 
   5136  1.1  christos 	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
   5137  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
   5138  1.1  christos 
   5139  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
   5140  1.1  christos .L256_enc_blocks_less_than_1:	//blocks left <= 1 - mask, hash and store the last (possibly partial) block in v5, reduce the tag, store counter and tag, return
   5141  1.1  christos 
   5142  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128
   5143  1.1  christos 
   5144  1.1  christos 	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff (x13 is no longer needed as a round-key word; it becomes the all-ones mask)
   5145  1.1  christos 	sub	x1, x1, #128                   //bit_length -= 128
   5146  1.1  christos 
   5147  1.1  christos 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   5148  1.1  christos 	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored (always a 16-byte read of the output buffer)
   5149  1.1  christos 
   5150  1.1  christos 	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
   5151  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128 (x1 = number of unused bits in the last block, 0 for a full block)
   5152  1.1  christos 
   5153  1.1  christos 	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block (register shift uses x1 modulo 64)
   5154  1.1  christos 	cmp	x1, #64
   5155  1.1  christos 
   5156  1.1  christos 	csel	x6, x13, x14, lt                //low 64 bits of mask: all ones if fewer than 64 bits are unused, else the shifted mask
   5157  1.1  christos 	csel	x7, x14, xzr, lt                //high 64 bits of mask: the shifted mask if fewer than 64 bits are unused, else zero
   5158  1.1  christos 
   5159  1.1  christos 	fmov	d0, x6                                //ctr0b is mask for last block
   5160  1.1  christos 
   5161  1.1  christos 	fmov	v0.d[1], x7                           //upper half of the mask
   5162  1.1  christos 
   5163  1.1  christos 	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
   5164  1.1  christos 
   5165  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final block
   5166  1.1  christos 
   5167  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (still non-zero only when no earlier tail section ran)
   5168  1.1  christos 
   5169  1.1  christos 	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
   5170  1.1  christos 
   5171  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
   5172  1.1  christos 	mov	d8, v4.d[1]                                 //GHASH final block - mid
   5173  1.1  christos #ifndef __AARCH64EB__
   5174  1.1  christos 	rev	w9, w12                                 //little-endian build: byte-swap the 32-bit counter before it is stored
   5175  1.1  christos #else
   5176  1.1  christos 	mov	w9, w12                                 //big-endian build: counter is stored as is
   5177  1.1  christos #endif
   5178  1.1  christos 
   5179  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
   5180  1.1  christos 
   5181  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
   5182  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
   5183  1.1  christos 
   5184  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
   5185  1.1  christos 
   5186  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
   5187  1.1  christos 
   5188  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
   5189  1.1  christos 	movi	v8.8b, #0xc2                            //every byte of d8 = 0xc2 ...
   5190  1.1  christos 
   5191  1.1  christos 	eor	v4.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
   5192  1.1  christos 
   5193  1.1  christos 	shl	d8, d8, #56              //mod_constant (... shifted so that d8 = 0xc200000000000000)
   5194  1.1  christos 
   5195  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                        //MODULO - karatsuba tidy up
   5196  1.1  christos 
   5197  1.1  christos 	pmull	v7.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
   5198  1.1  christos 
   5199  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
   5200  1.1  christos 
   5201  1.1  christos 	eor	v10.16b, v10.16b, v7.16b                     //MODULO - fold into mid
   5202  1.1  christos 
   5203  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
   5204  1.1  christos 
   5205  1.1  christos 	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
   5206  1.1  christos 
   5207  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
   5208  1.1  christos 
   5209  1.1  christos 	str	w9, [x16, #12]                         //store the updated counter (32-bit word at byte offset 12 of the buffer x16 points to)
   5210  1.1  christos 
   5211  1.1  christos 	st1	{ v5.16b}, [x2]                         //store all 16B (bytes beyond the partial block are the ones re-inserted by bif above)
   5212  1.1  christos 	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
   5213  1.1  christos 
   5214  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
   5215  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8           //swap the 64-bit halves of the tag ...
   5216  1.1  christos 	rev64	v11.16b, v11.16b                        //... and byte-reverse each half (the same ext/rev64 pair the dec kernel applies right after loading v11 from [x3])
   5217  1.1  christos 	mov	x0, x15                                 //return value; NOTE(review): x15 presumably holds byte_len from the prologue (not visible here; the dec kernel sets it with lsr x5, x1, #3 / mov x15, x5)
   5218  1.1  christos 	st1	{ v11.16b }, [x3]                       //write the updated tag back through x3
   5219  1.1  christos 
   5220  1.1  christos 	ldp	x21, x22, [sp, #16]                     //reload x19-x24 and d8-d15 from the stack frame (presumably mirrors the stp sequence in this function's prologue, as in the dec kernel)
   5221  1.1  christos 	ldp	x23, x24, [sp, #32]
   5222  1.1  christos 	ldp	d8, d9, [sp, #48]
   5223  1.1  christos 	ldp	d10, d11, [sp, #64]
   5224  1.1  christos 	ldp	d12, d13, [sp, #80]
   5225  1.1  christos 	ldp	d14, d15, [sp, #96]
   5226  1.1  christos 	ldp	x19, x20, [sp], #112                    //last reload also releases the 112-byte frame (post-index)
   5227  1.1  christos 	ret
   5228  1.1  christos 
   5229  1.1  christos .L256_enc_ret:	//early exit; NOTE(review): presumably the target of a cbz on x1 at function entry, as in the dec kernel (entry not visible here)
   5230  1.1  christos 	mov	w0, #0x0                                //return 0 (a write to w0 also clears the upper 32 bits of x0)
   5231  1.1  christos 	ret                                             //plain return: no registers are reloaded and sp is not adjusted on this path
   5232  1.1  christos .size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
   5233  1.1  christos .globl	aes_gcm_dec_256_kernel
   5234  1.1  christos .type	aes_gcm_dec_256_kernel,%function
   5235  1.1  christos .align	4
   5236  1.1  christos aes_gcm_dec_256_kernel:
   5237  1.2  christos 	AARCH64_VALID_CALL_TARGET
   5238  1.1  christos 	cbz	x1, .L256_dec_ret
   5239  1.1  christos 	stp	x19, x20, [sp, #-112]!
   5240  1.1  christos 	mov	x16, x4
   5241  1.1  christos 	mov	x8, x5
   5242  1.1  christos 	stp	x21, x22, [sp, #16]
   5243  1.1  christos 	stp	x23, x24, [sp, #32]
   5244  1.1  christos 	stp	d8, d9, [sp, #48]
   5245  1.1  christos 	stp	d10, d11, [sp, #64]
   5246  1.1  christos 	stp	d12, d13, [sp, #80]
   5247  1.1  christos 	stp	d14, d15, [sp, #96]
   5248  1.1  christos 
   5249  1.1  christos 	lsr	x5, x1, #3              //byte_len
   5250  1.1  christos 	mov	x15, x5
   5251  1.1  christos 	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
   5252  1.1  christos #ifdef __AARCH64EB__
   5253  1.1  christos 	rev	x10, x10
   5254  1.1  christos 	rev	x11, x11
   5255  1.1  christos #endif
   5256  1.1  christos 	ldp	x13, x14, [x8, #224]                     //load rk14
   5257  1.1  christos #ifdef __AARCH64EB__
   5258  1.1  christos 	ror	x14, x14, #32
   5259  1.1  christos 	ror	x13, x13, #32
   5260  1.1  christos #endif
   5261  1.1  christos 	ld1	{v18.4s}, [x8], #16                               //load rk0
   5262  1.1  christos 	sub	x5, x5, #1      //byte_len - 1
   5263  1.1  christos 
   5264  1.1  christos 	ld1	{v19.4s}, [x8], #16                               //load rk1
   5265  1.1  christos 	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
   5266  1.1  christos 
   5267  1.1  christos 	add	x4, x0, x1, lsr #3   //end_input_ptr
   5268  1.1  christos 	ld1	{v20.4s}, [x8], #16                               //load rk2
   5269  1.1  christos 
   5270  1.1  christos 	lsr	x12, x11, #32
   5271  1.1  christos 	ld1	{v21.4s}, [x8], #16                               //load rk3
   5272  1.1  christos 	orr	w11, w11, w11
   5273  1.1  christos 
   5274  1.1  christos 	ld1	{v22.4s}, [x8], #16                               //load rk4
   5275  1.1  christos 	add	x5, x5, x0
   5276  1.1  christos 	rev	w12, w12                                //rev_ctr32
   5277  1.1  christos 
   5278  1.1  christos 	add	w12, w12, #1                            //increment rev_ctr32
   5279  1.1  christos 	fmov	d3, x10                               //CTR block 3
   5280  1.1  christos 
   5281  1.1  christos 	rev	w9, w12                                 //CTR block 1
   5282  1.1  christos 	add	w12, w12, #1                            //CTR block 1
   5283  1.1  christos 	fmov	d1, x10                               //CTR block 1
   5284  1.1  christos 
   5285  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 1
   5286  1.1  christos 	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
   5287  1.1  christos 
   5288  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 1
   5289  1.1  christos 	rev	w9, w12                                 //CTR block 2
   5290  1.1  christos 	add	w12, w12, #1                            //CTR block 2
   5291  1.1  christos 
   5292  1.1  christos 	fmov	d2, x10                               //CTR block 2
   5293  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 2
   5294  1.1  christos 
   5295  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 2
   5296  1.1  christos 	rev	w9, w12                                 //CTR block 3
   5297  1.1  christos 
   5298  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 3
   5299  1.1  christos 	ld1	{v23.4s}, [x8], #16                               //load rk5
   5300  1.1  christos 
   5301  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 3
   5302  1.1  christos 	add	w12, w12, #1                            //CTR block 3
   5303  1.1  christos 
   5304  1.1  christos 	ld1	{v24.4s}, [x8], #16                               //load rk6
   5305  1.1  christos 
   5306  1.1  christos 	ld1	{v25.4s}, [x8], #16                               //load rk7
   5307  1.1  christos 
   5308  1.1  christos 	ld1	{v26.4s}, [x8], #16                               //load rk8
   5309  1.1  christos 
   5310  1.1  christos 	aese	v0.16b, v18.16b
   5311  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
   5312  1.1  christos 	ldr	q14, [x3, #80]                         //load h3l | h3h
   5313  1.1  christos #ifndef __AARCH64EB__
   5314  1.1  christos 	ext	v14.16b, v14.16b, v14.16b, #8
   5315  1.1  christos #endif
   5316  1.1  christos 
   5317  1.1  christos 	aese	v3.16b, v18.16b
   5318  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
   5319  1.1  christos 	ldr	q15, [x3, #112]                        //load h4l | h4h
   5320  1.1  christos #ifndef __AARCH64EB__
   5321  1.1  christos 	ext	v15.16b, v15.16b, v15.16b, #8
   5322  1.1  christos #endif
   5323  1.1  christos 
   5324  1.1  christos 	aese	v1.16b, v18.16b
   5325  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
   5326  1.1  christos 	ldr	q13, [x3, #64]                         //load h2l | h2h
   5327  1.1  christos #ifndef __AARCH64EB__
   5328  1.1  christos 	ext	v13.16b, v13.16b, v13.16b, #8
   5329  1.1  christos #endif
   5330  1.1  christos 
   5331  1.1  christos 	aese	v2.16b, v18.16b
   5332  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
   5333  1.1  christos 	ld1	{v27.4s}, [x8], #16                                 //load rk9
   5334  1.1  christos 
   5335  1.1  christos 	aese	v0.16b, v19.16b
   5336  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
   5337  1.1  christos 
   5338  1.1  christos 	aese	v1.16b, v19.16b
   5339  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
   5340  1.1  christos 	ld1	{ v11.16b}, [x3]
   5341  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8
   5342  1.1  christos 	rev64	v11.16b, v11.16b
   5343  1.1  christos 
   5344  1.1  christos 	aese	v2.16b, v19.16b
   5345  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
   5346  1.1  christos 	ld1	{v28.4s}, [x8], #16                              //load rk10
   5347  1.1  christos 
   5348  1.1  christos 	aese	v3.16b, v19.16b
   5349  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
   5350  1.1  christos 	ld1	{v29.4s}, [x8], #16                              //load rk11
   5351  1.1  christos 
   5352  1.1  christos 	aese	v0.16b, v20.16b
   5353  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
   5354  1.1  christos 	ldr	q12, [x3, #32]                         //load h1l | h1h
   5355  1.1  christos #ifndef __AARCH64EB__
   5356  1.1  christos 	ext	v12.16b, v12.16b, v12.16b, #8
   5357  1.1  christos #endif
   5358  1.1  christos 	aese	v2.16b, v20.16b
   5359  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
   5360  1.1  christos 	ld1	{v30.4s}, [x8], #16                              //load rk12
   5361  1.1  christos 
   5362  1.1  christos 	aese	v3.16b, v20.16b
   5363  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
   5364  1.1  christos 
   5365  1.1  christos 	aese	v0.16b, v21.16b
   5366  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
   5367  1.1  christos 
   5368  1.1  christos 	aese	v1.16b, v20.16b
   5369  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
   5370  1.1  christos 
   5371  1.1  christos 	aese	v3.16b, v21.16b
   5372  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
   5373  1.1  christos 
   5374  1.1  christos 	aese	v0.16b, v22.16b
   5375  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
   5376  1.1  christos 	cmp	x0, x5                   //check if we have <= 4 blocks
   5377  1.1  christos 
   5378  1.1  christos 	aese	v2.16b, v21.16b
   5379  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
   5380  1.1  christos 
   5381  1.1  christos 	aese	v1.16b, v21.16b
   5382  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
   5383  1.1  christos 
   5384  1.1  christos 	aese	v3.16b, v22.16b
   5385  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
   5386  1.1  christos 
   5387  1.1  christos 	aese	v2.16b, v22.16b
   5388  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
   5389  1.1  christos 
   5390  1.1  christos 	aese	v1.16b, v22.16b
   5391  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
   5392  1.1  christos 
   5393  1.1  christos 	aese	v3.16b, v23.16b
   5394  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
   5395  1.1  christos 
   5396  1.1  christos 	aese	v0.16b, v23.16b
   5397  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
   5398  1.1  christos 
   5399  1.1  christos 	aese	v1.16b, v23.16b
   5400  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
   5401  1.1  christos 
   5402  1.1  christos 	aese	v2.16b, v23.16b
   5403  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
   5404  1.1  christos 
   5405  1.1  christos 	aese	v0.16b, v24.16b
   5406  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
   5407  1.1  christos 
   5408  1.1  christos 	aese	v3.16b, v24.16b
   5409  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
   5410  1.1  christos 
   5411  1.1  christos 	aese	v1.16b, v24.16b
   5412  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
   5413  1.1  christos 
   5414  1.1  christos 	aese	v2.16b, v24.16b
   5415  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
   5416  1.1  christos 
   5417  1.1  christos 	aese	v0.16b, v25.16b
   5418  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
   5419  1.1  christos 
   5420  1.1  christos 	aese	v1.16b, v25.16b
   5421  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
   5422  1.1  christos 
   5423  1.1  christos 	aese	v3.16b, v25.16b
   5424  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
   5425  1.1  christos 
   5426  1.1  christos 	aese	v0.16b, v26.16b
   5427  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
   5428  1.1  christos 
   5429  1.1  christos 	aese	v2.16b, v25.16b
   5430  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
   5431  1.1  christos 
   5432  1.1  christos 	aese	v3.16b, v26.16b
   5433  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
   5434  1.1  christos 
   5435  1.1  christos 	aese	v1.16b, v26.16b
   5436  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
   5437  1.1  christos 
   5438  1.1  christos 	aese	v0.16b, v27.16b
   5439  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
   5440  1.1  christos 
   5441  1.1  christos 	aese	v2.16b, v26.16b
   5442  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
   5443  1.1  christos 	ld1	{v31.4s}, [x8], #16                             //load rk13
   5444  1.1  christos 
   5445  1.1  christos 	aese	v1.16b, v27.16b
   5446  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
   5447  1.1  christos 
   5448  1.1  christos 	aese	v0.16b, v28.16b
   5449  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
   5450  1.1  christos 
   5451  1.1  christos 	aese	v3.16b, v27.16b
   5452  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
   5453  1.1  christos 
   5454  1.1  christos 	aese	v1.16b, v28.16b
   5455  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
   5456  1.1  christos 
   5457  1.1  christos 	aese	v2.16b, v27.16b
   5458  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
   5459  1.1  christos 
   5460  1.1  christos 	aese	v3.16b, v28.16b
   5461  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
   5462  1.1  christos 
   5463  1.1  christos 	aese	v0.16b, v29.16b
   5464  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
   5465  1.1  christos 
   5466  1.1  christos 	aese	v2.16b, v28.16b
   5467  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
   5468  1.1  christos 
   5469  1.1  christos 	aese	v3.16b, v29.16b
   5470  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
   5471  1.1  christos 
   5472  1.1  christos 	aese	v1.16b, v29.16b
   5473  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
   5474  1.1  christos 
   5475  1.1  christos 	aese	v2.16b, v29.16b
   5476  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
   5477  1.1  christos 
   5478  1.1  christos 	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
   5479  1.1  christos 
   5480  1.1  christos 	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
   5481  1.1  christos 
   5482  1.1  christos 	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
   5483  1.1  christos 	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
   5484  1.1  christos 
   5485  1.1  christos 	aese	v1.16b, v30.16b
   5486  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
   5487  1.1  christos 
   5488  1.1  christos 	aese	v0.16b, v30.16b
   5489  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
   5490  1.1  christos 
   5491  1.1  christos 	aese	v2.16b, v30.16b
   5492  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
   5493  1.1  christos 
   5494  1.1  christos 	aese	v3.16b, v30.16b
   5495  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
   5496  1.1  christos 	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
   5497  1.1  christos 
   5498  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
   5499  1.1  christos 
   5500  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
   5501  1.1  christos 	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
   5502  1.1  christos 
   5503  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
   5504  1.1  christos 
   5505  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
   5506  1.1  christos 	b.ge	.L256_dec_tail                                    //handle tail
   5507  1.1  christos 
   5508  1.1  christos 	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
   5509  1.1  christos 
   5510  1.1  christos 	rev	w9, w12                                 //CTR block 4
   5511  1.1  christos 
   5512  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
   5513  1.1  christos 
   5514  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
   5515  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 1
   5516  1.1  christos 	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
   5517  1.1  christos 
   5518  1.1  christos 	mov	x7, v0.d[1]                            //AES block 0 - mov high
   5519  1.1  christos 
   5520  1.1  christos 	mov	x6, v0.d[0]                            //AES block 0 - mov low
   5521  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 0
   5522  1.1  christos 	add	w12, w12, #1                            //CTR block 4
   5523  1.1  christos 
   5524  1.1  christos 	fmov	d0, x10                               //CTR block 4
   5525  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4
   5526  1.1  christos 
   5527  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4
   5528  1.1  christos 	rev	w9, w12                                 //CTR block 5
   5529  1.1  christos 	add	w12, w12, #1                            //CTR block 5
   5530  1.1  christos 
   5531  1.1  christos 	mov	x19, v1.d[0]                            //AES block 1 - mov low
   5532  1.1  christos 
   5533  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 5
   5534  1.1  christos 	mov	x20, v1.d[1]                            //AES block 1 - mov high
   5535  1.1  christos 	eor	x7, x7, x14                   //AES block 0 - round 14 high
   5536  1.1  christos #ifdef __AARCH64EB__
   5537  1.1  christos 	rev	x7, x7
   5538  1.1  christos #endif
   5539  1.1  christos 	eor	x6, x6, x13                   //AES block 0 - round 14 low
   5540  1.1  christos #ifdef __AARCH64EB__
   5541  1.1  christos 	rev	x6, x6
   5542  1.1  christos #endif
   5543  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 0 - store result
   5544  1.1  christos 	fmov	d1, x10                               //CTR block 5
   5545  1.1  christos 
   5546  1.1  christos 	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
   5547  1.1  christos 
   5548  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 5
   5549  1.1  christos 	rev	w9, w12                                 //CTR block 6
   5550  1.1  christos 	add	w12, w12, #1                            //CTR block 6
   5551  1.1  christos 
   5552  1.1  christos 	eor	x19, x19, x13                   //AES block 1 - round 14 low
   5553  1.1  christos #ifdef __AARCH64EB__
   5554  1.1  christos 	rev	x19, x19
   5555  1.1  christos #endif
   5556  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 6
   5557  1.1  christos 
   5558  1.1  christos 	eor	x20, x20, x14                   //AES block 1 - round 14 high
   5559  1.1  christos #ifdef __AARCH64EB__
   5560  1.1  christos 	rev	x20, x20
   5561  1.1  christos #endif
   5562  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 1 - store result
   5563  1.1  christos 
   5564  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
   5565  1.1  christos 	cmp	x0, x5                   //check if we have <= 8 blocks
   5566  1.1  christos 	b.ge	.L256_dec_prepretail                              //do prepretail
   5567  1.1  christos 
   5568  1.1  christos .L256_dec_main_loop:	//main	loop start
   5569  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   5570  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   5571  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   5572  1.1  christos 
   5573  1.1  christos 	aese	v0.16b, v18.16b
   5574  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   5575  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   5576  1.1  christos 
   5577  1.1  christos 	aese	v1.16b, v18.16b
   5578  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   5579  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   5580  1.1  christos 
   5581  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   5582  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   5583  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   5584  1.1  christos 
   5585  1.1  christos 	aese	v0.16b, v19.16b
   5586  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   5587  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   5588  1.1  christos 
   5589  1.1  christos 	aese	v1.16b, v19.16b
   5590  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   5591  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   5592  1.1  christos 
   5593  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   5594  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   5595  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   5596  1.1  christos 
   5597  1.1  christos 	aese	v0.16b, v20.16b
   5598  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   5599  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   5600  1.1  christos 
   5601  1.1  christos 	aese	v2.16b, v18.16b
   5602  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   5603  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   5604  1.1  christos 
   5605  1.1  christos 	aese	v1.16b, v20.16b
   5606  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   5607  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   5608  1.1  christos 
   5609  1.1  christos 	aese	v0.16b, v21.16b
   5610  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   5611  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
   5612  1.1  christos #ifdef __AARCH64EB__
   5613  1.1  christos 	rev	x22, x22
   5614  1.1  christos #endif
   5615  1.1  christos 	aese	v2.16b, v19.16b
   5616  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   5617  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   5618  1.1  christos 
   5619  1.1  christos 	aese	v1.16b, v21.16b
   5620  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   5621  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   5622  1.1  christos 
   5623  1.1  christos 	aese	v3.16b, v18.16b
   5624  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   5625  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
   5626  1.1  christos #ifdef __AARCH64EB__
   5627  1.1  christos 	rev	x21, x21
   5628  1.1  christos #endif
   5629  1.1  christos 	aese	v2.16b, v20.16b
   5630  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   5631  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   5632  1.1  christos 
   5633  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   5634  1.1  christos 
   5635  1.1  christos 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   5636  1.1  christos 
   5637  1.1  christos 	aese	v2.16b, v21.16b
   5638  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   5639  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   5640  1.1  christos 
   5641  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   5642  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
   5643  1.1  christos #ifdef __AARCH64EB__
   5644  1.1  christos 	rev	x23, x23
   5645  1.1  christos #endif
   5646  1.1  christos 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   5647  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
   5648  1.1  christos #ifdef __AARCH64EB__
   5649  1.1  christos 	rev	x24, x24
   5650  1.1  christos #endif
   5651  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   5652  1.1  christos 
   5653  1.1  christos 	aese	v2.16b, v22.16b
   5654  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   5655  1.1  christos 
   5656  1.1  christos 	aese	v3.16b, v19.16b
   5657  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   5658  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   5659  1.1  christos 
   5660  1.1  christos 	aese	v0.16b, v22.16b
   5661  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   5662  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   5663  1.1  christos 
   5664  1.1  christos 	aese	v2.16b, v23.16b
   5665  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   5666  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   5667  1.1  christos 
   5668  1.1  christos 	aese	v3.16b, v20.16b
   5669  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   5670  1.1  christos 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   5671  1.1  christos 
   5672  1.1  christos 	aese	v1.16b, v22.16b
   5673  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   5674  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   5675  1.1  christos 
   5676  1.1  christos 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   5677  1.1  christos 
   5678  1.1  christos 	aese	v3.16b, v21.16b
   5679  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   5680  1.1  christos 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   5681  1.1  christos 
   5682  1.1  christos 	aese	v1.16b, v23.16b
   5683  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   5684  1.1  christos 
   5685  1.1  christos 	aese	v0.16b, v23.16b
   5686  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   5687  1.1  christos 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   5688  1.1  christos 
   5689  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   5690  1.1  christos 	rev	w9, w12                                 //CTR block 4k+8
   5691  1.1  christos 
   5692  1.1  christos 	aese	v1.16b, v24.16b
   5693  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   5694  1.1  christos 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   5695  1.1  christos 
   5696  1.1  christos 	aese	v0.16b, v24.16b
   5697  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   5698  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+8
   5699  1.1  christos 
   5700  1.1  christos 	aese	v3.16b, v22.16b
   5701  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   5702  1.1  christos 
   5703  1.1  christos 	aese	v1.16b, v25.16b
   5704  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   5705  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   5706  1.1  christos 
   5707  1.1  christos 	aese	v0.16b, v25.16b
   5708  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   5709  1.1  christos 
   5710  1.1  christos 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   5711  1.1  christos 	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
   5712  1.1  christos 
   5713  1.1  christos 	aese	v3.16b, v23.16b
   5714  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   5715  1.1  christos 
   5716  1.1  christos 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   5717  1.1  christos 
   5718  1.1  christos 	aese	v0.16b, v26.16b
   5719  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   5720  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   5721  1.1  christos 
   5722  1.1  christos 	aese	v3.16b, v24.16b
   5723  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   5724  1.1  christos 
   5725  1.1  christos 	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   5726  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
   5727  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   5728  1.1  christos 
   5729  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   5730  1.1  christos 
   5731  1.1  christos 	aese	v0.16b, v27.16b
   5732  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   5733  1.1  christos 	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
   5734  1.1  christos 
   5735  1.1  christos 	aese	v1.16b, v26.16b
   5736  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   5737  1.1  christos 
   5738  1.1  christos 	aese	v2.16b, v24.16b
   5739  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   5740  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   5741  1.1  christos 
   5742  1.1  christos 	aese	v0.16b, v28.16b
   5743  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   5744  1.1  christos 
   5745  1.1  christos 	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
   5746  1.1  christos 	movi	v8.8b, #0xc2
   5747  1.1  christos 
   5748  1.1  christos 	aese	v2.16b, v25.16b
   5749  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   5750  1.1  christos 	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
   5751  1.1  christos 
   5752  1.1  christos 	aese	v0.16b, v29.16b
   5753  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   5754  1.1  christos 
   5755  1.1  christos 	aese	v3.16b, v25.16b
   5756  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   5757  1.1  christos 	shl	d8, d8, #56               //mod_constant
   5758  1.1  christos 
   5759  1.1  christos 	aese	v2.16b, v26.16b
   5760  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   5761  1.1  christos 	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
   5762  1.1  christos 
   5763  1.1  christos 	aese	v0.16b, v30.16b
   5764  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   5765  1.1  christos 
   5766  1.1  christos 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   5767  1.1  christos 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   5768  1.1  christos 
   5769  1.1  christos 	aese	v1.16b, v27.16b
   5770  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   5771  1.1  christos 	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
   5772  1.1  christos 
   5773  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   5774  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   5775  1.1  christos 
   5776  1.1  christos 	aese	v1.16b, v28.16b
   5777  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   5778  1.1  christos 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   5779  1.1  christos 
   5780  1.1  christos 	aese	v2.16b, v27.16b
   5781  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   5782  1.1  christos 	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
   5783  1.1  christos 
   5784  1.1  christos 	aese	v3.16b, v26.16b
   5785  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   5786  1.1  christos 	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
   5787  1.1  christos 
   5788  1.1  christos 	aese	v1.16b, v29.16b
   5789  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   5790  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   5791  1.1  christos 
   5792  1.1  christos 	aese	v2.16b, v28.16b
   5793  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   5794  1.1  christos 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   5795  1.1  christos 
   5796  1.1  christos 	aese	v3.16b, v27.16b
   5797  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   5798  1.1  christos 	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
   5799  1.1  christos 
   5800  1.1  christos 	aese	v1.16b, v30.16b
   5801  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   5802  1.1  christos 	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
   5803  1.1  christos 
   5804  1.1  christos 	aese	v2.16b, v29.16b
   5805  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   5806  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   5807  1.1  christos 
   5808  1.1  christos 	aese	v3.16b, v28.16b
   5809  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   5810  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   5811  1.1  christos 
   5812  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   5813  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   5814  1.1  christos 
   5815  1.1  christos 	aese	v2.16b, v30.16b
   5816  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   5817  1.1  christos 	fmov	d0, x10                               //CTR block 4k+8
   5818  1.1  christos 
   5819  1.1  christos 	aese	v3.16b, v29.16b
   5820  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   5821  1.1  christos 	fmov	v0.d[1], x9                               //CTR block 4k+8
   5822  1.1  christos 
   5823  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   5824  1.1  christos 	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
   5825  1.1  christos 	rev	w9, w12                                 //CTR block 4k+9
   5826  1.1  christos 
   5827  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   5828  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
   5829  1.1  christos 	cmp	x0, x5                   //.LOOP CONTROL
   5830  1.1  christos 
   5831  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+9
   5832  1.1  christos 
   5833  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
   5834  1.1  christos #ifdef __AARCH64EB__
   5835  1.1  christos 	rev	x6, x6
   5836  1.1  christos #endif
   5837  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
   5838  1.1  christos #ifdef __AARCH64EB__
   5839  1.1  christos 	rev	x7, x7
   5840  1.1  christos #endif
   5841  1.1  christos 	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
   5842  1.1  christos 	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
   5843  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   5844  1.1  christos 
   5845  1.1  christos 	aese	v3.16b, v30.16b
   5846  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   5847  1.1  christos 	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
   5848  1.1  christos 
   5849  1.1  christos 	fmov	d1, x10                               //CTR block 4k+9
   5850  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   5851  1.1  christos 
   5852  1.1  christos 	fmov	v1.d[1], x9                               //CTR block 4k+9
   5853  1.1  christos 	rev	w9, w12                                 //CTR block 4k+10
   5854  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+10
   5855  1.1  christos 
   5856  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   5857  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
   5858  1.1  christos 
   5859  1.1  christos 	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
   5860  1.1  christos 	eor	x20, x20, x14                   //AES block 4k+5 - round 14 high
   5861  1.1  christos #ifdef __AARCH64EB__
   5862  1.1  christos 	rev	x20, x20
   5863  1.1  christos #endif
   5864  1.1  christos 	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
   5865  1.1  christos 
   5866  1.1  christos 	eor	x19, x19, x13                   //AES block 4k+5 - round 14 low
   5867  1.1  christos #ifdef __AARCH64EB__
   5868  1.1  christos 	rev	x19, x19
   5869  1.1  christos #endif
   5870  1.1  christos 	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
   5871  1.1  christos 
   5872  1.1  christos 	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
   5873  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   5874  1.1  christos 	b.lt	.L256_dec_main_loop
   5875  1.1  christos 
   5876  1.1  christos 
   5877  1.1  christos .L256_dec_prepretail:	//PREPRETAIL
   5878  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
   5879  1.1  christos 	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
   5880  1.1  christos 	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
   5881  1.1  christos 
   5882  1.1  christos 	aese	v0.16b, v18.16b
   5883  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
   5884  1.1  christos 	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
   5885  1.1  christos 
   5886  1.1  christos 	aese	v1.16b, v18.16b
   5887  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
   5888  1.1  christos 	fmov	d2, x10                               //CTR block 4k+6
   5889  1.1  christos 
   5890  1.1  christos 	fmov	v2.d[1], x9                               //CTR block 4k+6
   5891  1.1  christos 	rev	w9, w12                                 //CTR block 4k+7
   5892  1.1  christos 	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
   5893  1.1  christos 
   5894  1.1  christos 	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
   5895  1.1  christos 	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
   5896  1.1  christos 	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
   5897  1.1  christos 
   5898  1.1  christos 	aese	v1.16b, v19.16b
   5899  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
   5900  1.1  christos 	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
   5901  1.1  christos 
   5902  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
   5903  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
   5904  1.1  christos 	fmov	d3, x10                               //CTR block 4k+7
   5905  1.1  christos 
   5906  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
   5907  1.1  christos 	fmov	v3.d[1], x9                               //CTR block 4k+7
   5908  1.1  christos 
   5909  1.1  christos 	aese	v2.16b, v18.16b
   5910  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
   5911  1.1  christos 	mov	d10, v17.d[1]                               //GHASH block 4k - mid
   5912  1.1  christos 
   5913  1.1  christos 	aese	v0.16b, v19.16b
   5914  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
   5915  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
   5916  1.1  christos 
   5917  1.1  christos 	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
   5918  1.1  christos 
   5919  1.1  christos 	aese	v2.16b, v19.16b
   5920  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
   5921  1.1  christos 	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
   5922  1.1  christos 
   5923  1.1  christos 	aese	v3.16b, v18.16b
   5924  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
   5925  1.1  christos 
   5926  1.1  christos 	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
   5927  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
   5928  1.1  christos 
   5929  1.1  christos 	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
   5930  1.1  christos 
   5931  1.1  christos 	aese	v3.16b, v19.16b
   5932  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
   5933  1.1  christos 	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
   5934  1.1  christos 
   5935  1.1  christos 	aese	v0.16b, v20.16b
   5936  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
   5937  1.1  christos 
   5938  1.1  christos 	aese	v1.16b, v20.16b
   5939  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
   5940  1.1  christos 	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
   5941  1.1  christos 
   5942  1.1  christos 	aese	v2.16b, v20.16b
   5943  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
   5944  1.1  christos 
   5945  1.1  christos 	aese	v0.16b, v21.16b
   5946  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
   5947  1.1  christos 	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
   5948  1.1  christos 
   5949  1.1  christos 	aese	v3.16b, v20.16b
   5950  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
   5951  1.1  christos 	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
   5952  1.1  christos 
   5953  1.1  christos 	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
   5954  1.1  christos 
   5955  1.1  christos 	aese	v0.16b, v22.16b
   5956  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
   5957  1.1  christos 
   5958  1.1  christos 	aese	v3.16b, v21.16b
   5959  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
   5960  1.1  christos 	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
   5961  1.1  christos 
   5962  1.1  christos 	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
   5963  1.1  christos 
   5964  1.1  christos 	aese	v0.16b, v23.16b
   5965  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
   5966  1.1  christos 	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
   5967  1.1  christos 
   5968  1.1  christos 	aese	v3.16b, v22.16b
   5969  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
   5970  1.1  christos 
   5971  1.1  christos 	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
   5972  1.1  christos 	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
   5973  1.1  christos 
   5974  1.1  christos 	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
   5975  1.1  christos 
   5976  1.1  christos 	aese	v3.16b, v23.16b
   5977  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
   5978  1.1  christos 	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
   5979  1.1  christos 
   5980  1.1  christos 	aese	v2.16b, v21.16b
   5981  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
   5982  1.1  christos 
   5983  1.1  christos 	aese	v1.16b, v21.16b
   5984  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
   5985  1.1  christos 	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
   5986  1.1  christos 
   5987  1.1  christos 	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
   5988  1.1  christos 
   5989  1.1  christos 	aese	v2.16b, v22.16b
   5990  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
   5991  1.1  christos 	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
   5992  1.1  christos 
   5993  1.1  christos 	aese	v1.16b, v22.16b
   5994  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
   5995  1.1  christos 
   5996  1.1  christos 	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
   5997  1.1  christos 
   5998  1.1  christos 	aese	v2.16b, v23.16b
   5999  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
   6000  1.1  christos 	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
   6001  1.1  christos 
   6002  1.1  christos 	aese	v1.16b, v23.16b
   6003  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
   6004  1.1  christos 
   6005  1.1  christos 	aese	v3.16b, v24.16b
   6006  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
   6007  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
   6008  1.1  christos 
   6009  1.1  christos 	aese	v2.16b, v24.16b
   6010  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
   6011  1.1  christos 
   6012  1.1  christos 	aese	v0.16b, v24.16b
   6013  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
   6014  1.1  christos 	movi	v8.8b, #0xc2
   6015  1.1  christos 
   6016  1.1  christos 	aese	v1.16b, v24.16b
   6017  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
   6018  1.1  christos 	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
   6019  1.1  christos 
   6020  1.1  christos 	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
   6021  1.1  christos 
   6022  1.1  christos 	aese	v3.16b, v25.16b
   6023  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
   6024  1.1  christos 	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
   6025  1.1  christos 
   6026  1.1  christos 	aese	v1.16b, v25.16b
   6027  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
   6028  1.1  christos 
   6029  1.1  christos 	aese	v0.16b, v25.16b
   6030  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
   6031  1.1  christos 	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
   6032  1.1  christos 
   6033  1.1  christos 	aese	v3.16b, v26.16b
   6034  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
   6035  1.1  christos 
   6036  1.1  christos 	aese	v2.16b, v25.16b
   6037  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
   6038  1.1  christos 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   6039  1.1  christos 
   6040  1.1  christos 	aese	v1.16b, v26.16b
   6041  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
   6042  1.1  christos 
   6043  1.1  christos 	aese	v0.16b, v26.16b
   6044  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
   6045  1.1  christos 	shl	d8, d8, #56               //mod_constant
   6046  1.1  christos 
   6047  1.1  christos 	aese	v2.16b, v26.16b
   6048  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
   6049  1.1  christos 
   6050  1.1  christos 	aese	v1.16b, v27.16b
   6051  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
   6052  1.1  christos 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   6053  1.1  christos 
   6054  1.1  christos 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   6055  1.1  christos 
   6056  1.1  christos 	aese	v2.16b, v27.16b
   6057  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
   6058  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   6059  1.1  christos 
   6060  1.1  christos 	aese	v3.16b, v27.16b
   6061  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
   6062  1.1  christos 
   6063  1.1  christos 	aese	v0.16b, v27.16b
   6064  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
   6065  1.1  christos 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   6066  1.1  christos 
   6067  1.1  christos 	aese	v2.16b, v28.16b
   6068  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
   6069  1.1  christos 
   6070  1.1  christos 	aese	v3.16b, v28.16b
   6071  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
   6072  1.1  christos 
   6073  1.1  christos 	aese	v0.16b, v28.16b
   6074  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
   6075  1.1  christos 	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
   6076  1.1  christos #ifdef __AARCH64EB__
   6077  1.1  christos 	rev	x22, x22
   6078  1.1  christos #endif
   6079  1.1  christos 	aese	v1.16b, v28.16b
   6080  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
   6081  1.1  christos 	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
   6082  1.1  christos #ifdef __AARCH64EB__
   6083  1.1  christos 	rev	x23, x23
   6084  1.1  christos #endif
   6085  1.1  christos 	aese	v2.16b, v29.16b
   6086  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
   6087  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   6088  1.1  christos 
   6089  1.1  christos 	aese	v0.16b, v29.16b
   6090  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
   6091  1.1  christos 	add	w12, w12, #1                            //CTR block 4k+7
   6092  1.1  christos 
   6093  1.1  christos 	aese	v1.16b, v29.16b
   6094  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
   6095  1.1  christos 	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
   6096  1.1  christos #ifdef __AARCH64EB__
   6097  1.1  christos 	rev	x21, x21
   6098  1.1  christos #endif
   6099  1.1  christos 
   6100  1.1  christos 	aese	v2.16b, v30.16b
   6101  1.1  christos 	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
   6102  1.1  christos 
   6103  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   6104  1.1  christos 	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
   6105  1.1  christos #ifdef __AARCH64EB__
   6106  1.1  christos 	rev	x24, x24
   6107  1.1  christos #endif
   6108  1.1  christos 
   6109  1.1  christos 	aese	v3.16b, v29.16b
   6110  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
   6111  1.1  christos 	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
   6112  1.1  christos 
   6113  1.1  christos 	aese	v1.16b, v30.16b
   6114  1.1  christos 	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
   6115  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   6116  1.1  christos 
   6117  1.1  christos 	aese	v0.16b, v30.16b
   6118  1.1  christos 	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
   6119  1.1  christos 	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
   6120  1.1  christos 
   6121  1.1  christos 	aese	v3.16b, v30.16b
   6122  1.1  christos 	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
   6123  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   6124  1.1  christos 
   6125  1.1  christos 	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
   6126  1.1  christos 
   6127  1.1  christos 	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
   6128  1.1  christos 
   6129  1.1  christos 	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
   6130  1.1  christos 
   6131  1.1  christos 	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
   6132  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   6133  1.1  christos .L256_dec_tail:	//TAIL - at most four 16-byte blocks are loaded from here on; dispatch on the byte count in x5
   6134  1.1  christos 
   6135  1.1  christos 	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
   6136  1.1  christos 	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
   6137  1.1  christos 
   6138  1.1  christos 	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
   6139  1.1  christos 
   6140  1.1  christos 	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
   6141  1.1  christos 
   6142  1.1  christos 	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
   6143  1.1  christos 	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
   6144  1.1  christos 
   6145  1.1  christos 	cmp	x5, #48                        //more than 3 blocks left? flags are consumed by the b.gt below (eor/rev in between do not set flags)
   6146  1.1  christos 
   6147  1.1  christos 	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
   6148  1.1  christos #ifdef __AARCH64EB__
   6149  1.1  christos 	rev	x6, x6                        //big-endian build only: byte-swap ahead of the later stp of x6
   6150  1.1  christos #endif
   6151  1.1  christos 
   6152  1.1  christos 	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
   6153  1.1  christos #ifdef __AARCH64EB__
   6154  1.1  christos 	rev	x7, x7                        //big-endian build only: byte-swap ahead of the later stp of x7
   6155  1.1  christos #endif
   6156  1.1  christos 	b.gt	.L256_dec_blocks_more_than_3        //x5 > 48
   6157  1.1  christos 
   6158  1.1  christos 	sub	w12, w12, #1                   //x5 <= 48: step back w12, the counter that is stored to [x16, #12] at the end
   6159  1.1  christos 	mov	v3.16b, v2.16b                 //the final block stage reads v3; in the 3-block case it must hold the old v2
   6160  1.1  christos 	movi	v10.8b, #0                     //clear GHASH mid accumulator (the more_than_3 stage that would overwrite it is skipped)
   6161  1.1  christos 
   6162  1.1  christos 	movi	v11.8b, #0                     //clear GHASH low accumulator; its previous value was captured in v8 by the ext above
   6163  1.1  christos 	cmp	x5, #32                        //more than 2 blocks left?
   6164  1.1  christos 
   6165  1.1  christos 	movi	v9.8b, #0                      //clear GHASH high accumulator
   6166  1.1  christos 	mov	v2.16b, v1.16b                 //the final-1 block stage reads v2; give it v1
   6167  1.1  christos 	b.gt	.L256_dec_blocks_more_than_2        //x5 > 32
   6168  1.1  christos 
   6169  1.1  christos 	sub	w12, w12, #1                   //x5 <= 32: step the counter back once more
   6170  1.1  christos 
   6171  1.1  christos 	mov	v3.16b, v1.16b                 //2-block case: the final block stage reads v3; give it v1
   6172  1.1  christos 	cmp	x5, #16                        //more than 1 block left?
   6173  1.1  christos 	b.gt	.L256_dec_blocks_more_than_1        //x5 > 16
   6174  1.1  christos 
   6175  1.1  christos 	sub	w12, w12, #1                   //x5 <= 16: step the counter back a third time
   6176  1.1  christos 	b	.L256_dec_blocks_less_than_1        //the block loaded above is the last one
   6177  1.1  christos .L256_dec_blocks_more_than_3:	//blocks	left >  3 - this stage writes v9/v10/v11 directly with pmull/pmull2 (no accumulation)
   6178  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
   6179  1.1  christos 	ld1	{ v5.16b}, [x0], #16                     //AES final-2 block - load ciphertext
   6180  1.1  christos 
   6181  1.1  christos 	stp	x6, x7, [x2], #16       //AES final-3 block  - store result
   6182  1.1  christos 
   6183  1.1  christos 	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
   6184  1.1  christos 
   6185  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6186  1.1  christos 
   6187  1.1  christos 	eor	v0.16b, v5.16b, v1.16b                           //AES final-2 block - result
   6188  1.1  christos 
   6189  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
   6190  1.1  christos 
   6191  1.1  christos 	mov	x6, v0.d[0]                           //AES final-2 block - mov low
   6192  1.1  christos 
   6193  1.1  christos 	mov	x7, v0.d[1]                           //AES final-2 block - mov high
   6194  1.1  christos 
   6195  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
   6196  1.1  christos 
   6197  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6198  1.1  christos 
   6199  1.1  christos 	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
   6200  1.1  christos 
   6201  1.1  christos 	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
   6202  1.1  christos 	eor	x6, x6, x13                  //AES final-2 block - round 14 low
   6203  1.1  christos #ifdef __AARCH64EB__
   6204  1.1  christos 	rev	x6, x6                       //big-endian build only: byte-swap ahead of the stp in the next stage
   6205  1.1  christos #endif
   6206  1.1  christos 
   6207  1.1  christos 	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
   6208  1.1  christos 	eor	x7, x7, x14                  //AES final-2 block - round 14 high
   6209  1.1  christos #ifdef __AARCH64EB__
   6210  1.1  christos 	rev	x7, x7                       //big-endian build only: byte-swap ahead of the stp in the next stage
   6211  1.1  christos #endif
   6212  1.1  christos .L256_dec_blocks_more_than_2:	//blocks	left >  2 - reached by fall-through or from the tail dispatch; products land in v20/v21/v22 and are eor-ed into v9/v11/v10
   6213  1.1  christos 
   6214  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
   6215  1.1  christos 	ld1	{ v5.16b}, [x0], #16                     //AES final-1 block - load ciphertext
   6216  1.1  christos 
   6217  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6218  1.1  christos 	stp	x6, x7, [x2], #16       //AES final-2 block  - store result
   6219  1.1  christos 
   6220  1.1  christos 	eor	v0.16b, v5.16b, v2.16b                           //AES final-1 block - result
   6221  1.1  christos 
   6222  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
   6223  1.1  christos 
   6224  1.1  christos 	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
   6225  1.1  christos 
   6226  1.1  christos 	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
   6227  1.1  christos 
   6228  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
   6229  1.1  christos 	mov	x6, v0.d[0]                           //AES final-1 block - mov low
   6230  1.1  christos 
   6231  1.1  christos 	mov	x7, v0.d[1]                           //AES final-1 block - mov high
   6232  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
   6233  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6234  1.1  christos 
   6235  1.1  christos 	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
   6236  1.1  christos 
   6237  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
   6238  1.1  christos 	eor	x6, x6, x13                  //AES final-1 block - round 14 low
   6239  1.1  christos #ifdef __AARCH64EB__
   6240  1.1  christos 	rev	x6, x6                       //big-endian build only: byte-swap ahead of the stp in the next stage
   6241  1.1  christos #endif
   6242  1.1  christos 
   6243  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
   6244  1.1  christos 	eor	x7, x7, x14                  //AES final-1 block - round 14 high
   6245  1.1  christos #ifdef __AARCH64EB__
   6246  1.1  christos 	rev	x7, x7                       //big-endian build only: byte-swap ahead of the stp in the next stage
   6247  1.1  christos #endif
   6248  1.1  christos .L256_dec_blocks_more_than_1:	//blocks	left >  1 - stores the previous result, GHASHes the previous ciphertext block, loads the final block
   6249  1.1  christos 
   6250  1.1  christos 	stp	x6, x7, [x2], #16       //AES final-1 block  - store result
   6251  1.1  christos 	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
   6252  1.1  christos 
   6253  1.1  christos 	ld1	{ v5.16b}, [x0], #16                     //AES final block - load ciphertext
   6254  1.1  christos 
   6255  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
   6256  1.1  christos 	movi	v8.8b, #0                                       //suppress further partial tag feed in
   6257  1.1  christos 
   6258  1.1  christos 	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
   6259  1.1  christos 
   6260  1.1  christos 	eor	v0.16b, v5.16b, v3.16b                           //AES final block - result
   6261  1.1  christos 
   6262  1.1  christos 	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
   6263  1.1  christos 
   6264  1.1  christos 	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
   6265  1.1  christos 
   6266  1.1  christos 	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
   6267  1.1  christos 	mov	x6, v0.d[0]                           //AES final block - mov low
   6268  1.1  christos 
   6269  1.1  christos 	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid (copy the low half up so pmull2 below can pair it with the high half of v16)
   6270  1.1  christos 
   6271  1.1  christos 	mov	x7, v0.d[1]                           //AES final block - mov high
   6272  1.1  christos 
   6273  1.1  christos 	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
   6274  1.1  christos 	eor	x6, x6, x13                  //AES final block - round 14 low
   6275  1.1  christos #ifdef __AARCH64EB__
   6276  1.1  christos 	rev	x6, x6                       //big-endian build only: byte-swap ahead of the masking and final stp
   6277  1.1  christos #endif
   6278  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
   6279  1.1  christos 
   6280  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
   6281  1.1  christos 
   6282  1.1  christos 	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
   6283  1.1  christos 	eor	x7, x7, x14                  //AES final block - round 14 high
   6284  1.1  christos #ifdef __AARCH64EB__
   6285  1.1  christos 	rev	x7, x7                       //big-endian build only: byte-swap ahead of the masking and final stp
   6286  1.1  christos #endif
   6287  1.1  christos .L256_dec_blocks_less_than_1:	//blocks	left <= 1 - mask the (possibly partial) last block, finish GHASH, reduce, write back and return
   6288  1.1  christos 
   6289  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128
   6290  1.1  christos 	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
   6291  1.1  christos 
   6292  1.1  christos 	sub	x1, x1, #128                   //bit_length -= 128
   6293  1.1  christos 	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
   6294  1.1  christos 
   6295  1.1  christos 	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
   6296  1.1  christos 	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
   6297  1.1  christos 
   6298  1.1  christos 	and	x1, x1, #127                   //bit_length %= 128
   6299  1.1  christos 
   6300  1.1  christos 	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
   6301  1.1  christos 	cmp	x1, #64                        //x1 = bits of the last block not covered by input (0..127); does the cut fall in the high or the low 64b?
   6302  1.1  christos 
   6303  1.1  christos 	csel	x9, x13, x14, lt               //low 64b mask: all ones when x1 < 64, otherwise the shifted mask
   6304  1.1  christos 	csel	x10, x14, xzr, lt              //high 64b mask: the shifted mask when x1 < 64, otherwise zero
   6305  1.1  christos 
   6306  1.1  christos 	fmov	d0, x9                                  //ctr0b is mask for last block
   6307  1.1  christos 	and	x6, x6, x9                     //keep only the valid low bytes of the result
   6308  1.1  christos 
   6309  1.1  christos 	mov	v0.d[1], x10                   //high half of the vector mask
   6310  1.1  christos 	bic	x4, x4, x9          //mask out low existing bytes
   6311  1.1  christos 
   6312  1.1  christos #ifndef __AARCH64EB__
   6313  1.1  christos 	rev	w9, w12                        //little-endian build: byte-swap the counter for the str below (x9 is no longer needed as a mask)
   6314  1.1  christos #else
   6315  1.1  christos 	mov	w9, w12                        //big-endian build: counter is stored as is
   6316  1.1  christos #endif
   6317  1.1  christos 
   6318  1.1  christos 	bic	x5, x5, x10      //mask out high existing bytes
   6319  1.1  christos 
   6320  1.1  christos 	orr	x6, x6, x4                     //merge result low bytes with the preserved existing bytes
   6321  1.1  christos 
   6322  1.1  christos 	and	x7, x7, x10                    //keep only the valid high bytes of the result
   6323  1.1  christos 
   6324  1.1  christos 	orr	x7, x7, x5                     //merge result high bytes with the preserved existing bytes
   6325  1.1  christos 
   6326  1.1  christos 	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
   6327  1.1  christos 
   6328  1.1  christos 	rev64	v4.16b, v5.16b                                    //GHASH final block
   6329  1.1  christos 
   6330  1.1  christos 	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
   6331  1.1  christos 
   6332  1.1  christos 	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
   6333  1.1  christos 
   6334  1.1  christos 	mov	d8, v4.d[1]                                  //GHASH final block - mid
   6335  1.1  christos 
   6336  1.1  christos 	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
   6337  1.1  christos 
   6338  1.1  christos 	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
   6339  1.1  christos 
   6340  1.1  christos 	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
   6341  1.1  christos 
   6342  1.1  christos 	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
   6343  1.1  christos 
   6344  1.1  christos 	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
   6345  1.1  christos 
   6346  1.1  christos 	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
   6347  1.1  christos 	movi	v8.8b, #0xc2                   //every byte of d8 = 0xc2; the shl below leaves 0xc200000000000000
   6348  1.1  christos 
   6349  1.1  christos 	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
   6350  1.1  christos 
   6351  1.1  christos 	shl	d8, d8, #56               //mod_constant
   6352  1.1  christos 
   6353  1.1  christos 	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
   6354  1.1  christos 
   6355  1.1  christos 	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
   6356  1.1  christos 
   6357  1.1  christos 	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
   6358  1.1  christos 
   6359  1.1  christos 	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
   6360  1.1  christos 
   6361  1.1  christos 	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
   6362  1.1  christos 
   6363  1.1  christos 	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
   6364  1.1  christos 
   6365  1.1  christos 	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
   6366  1.1  christos 
   6367  1.1  christos 	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
   6368  1.1  christos 
   6369  1.1  christos 	stp	x6, x7, [x2]                   //store the merged last block (no post-increment)
   6370  1.1  christos 
   6371  1.1  christos 	str	w9, [x16, #12]                          //store the updated counter
   6372  1.1  christos 
   6373  1.1  christos 	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
   6374  1.1  christos 	ext	v11.16b, v11.16b, v11.16b, #8  //swap the two 64b halves of the reduced value
   6375  1.1  christos 	rev64	v11.16b, v11.16b               //byte-reverse each 64b half
   6376  1.1  christos 	mov	x0, x15                        //return value is taken from x15, which is set outside this view
   6377  1.1  christos 	st1	{ v11.16b }, [x3]              //write the reduced value back to [x3]
   6378  1.1  christos 
   6379  1.1  christos 	ldp	x21, x22, [sp, #16]            //reload registers from the stack frame (the matching saves are outside this view)
   6380  1.1  christos 	ldp	x23, x24, [sp, #32]
   6381  1.1  christos 	ldp	d8, d9, [sp, #48]
   6382  1.1  christos 	ldp	d10, d11, [sp, #64]
   6383  1.1  christos 	ldp	d12, d13, [sp, #80]
   6384  1.1  christos 	ldp	d14, d15, [sp, #96]
   6385  1.1  christos 	ldp	x19, x20, [sp], #112           //reload x19/x20 and release the 112-byte frame (post-index)
   6386  1.1  christos 	ret
   6387  1.1  christos 
   6388  1.1  christos .L256_dec_ret:	//exit that returns 0 without reloading registers or adjusting sp; NOTE(review): the branch to this label is outside this view - presumably taken before the frame is built, confirm
   6389  1.1  christos 	mov	w0, #0x0                       //return value 0
   6390  1.1  christos 	ret
   6391  1.1  christos .size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
   6392  1.2  christos .section	.rodata	//the credit string below is placed in read-only data
   6393  1.1  christos .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	//NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by" followed by an e-mail address in angle brackets
   6394  1.1  christos .align	2
   6395  1.1  christos .align	2	//repeats the directive above; no further effect
   6396  1.1  christos #endif
   6397