// vpaes-armv8 — "Vector Permutation AES" for AArch64 (Mike Hamburg's
// constant-time table-permutation technique, transcribed from vpaes-x86_64).
      1  1.2  christos #include "arm_arch.h"
      2  1.2  christos 
      3  1.2  christos .section	.rodata
      4  1.1  christos 
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
// MixColumns rotation tables: four 16-byte permutations each, indexed by
// round number mod 4 (the cores step x11 through them and mask with ~(1<<6)).
.Lk_mc_forward:	//	mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	//	mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
// ShiftRows permutations for the final round, selected by (rounds mod 4).
.Lk_sr:	//	sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants — loaded into v18-v27 by the preheat routines.
//
.Lk_inv:	//	inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	//	input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	//	sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	//	sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	//	sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	//	decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	//	decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	//	decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	//	decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	//	decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	//	rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	//	output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

// "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)\0"
.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6
     95  1.2  christos 
     96  1.2  christos .text
     97  1.2  christos 
//
//  _vpaes_encrypt_preheat
//
//  Loads the "hot" encryption constants so the core routines need no
//  further table loads:
//     v17      = 0x0f nibble mask
//     v18-v19  = .Lk_inv (inv, inva)
//     v20-v23  = .Lk_ipt, .Lk_sbo
//     v24-v27  = .Lk_sb1, .Lk_sb2
//  Clobbers x10.  (adrp/add pair keeps this position-independent.)
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
    115  1.1  christos 
//
//  _vpaes_encrypt_core
//
//  AES-encrypt the single block in v7.
//
//  Inputs:
//     v7       = input block
//     v17-v27  = constants as loaded by _vpaes_encrypt_preheat
//     x2       = scheduled key (round count at [x2,#240], AES_KEY layout)
//
//  Output in v0.
//  Clobbers v1-v5, v16, x8-x11.
//  The x86-style comments on each line show the vpaes-x86_64 instruction
//  this code was mechanically transcribed from.
//
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
    200  1.1  christos 
//
//  void vpaes_encrypt(const unsigned char in[16], unsigned char out[16],
//                     const AES_KEY *key)
//  x0 = in, x1 = out, x2 = scheduled key (round count at [x2,#240]).
//  NOTE(review): prototype inferred from register usage; confirm against
//  the public header.
//
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER		// PAC macro from arm_arch.h
	stp	x29,x30,[sp,#-16]!		// standard frame; bl below clobbers LR
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]			// load plaintext into v7 (core's input reg)
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]			// core leaves ciphertext in v0

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
    218  1.1  christos 
//
//  _vpaes_encrypt_2x
//
//  Two-block variant of _vpaes_encrypt_core: encrypts v14 and v15 in
//  parallel (each "v8-v13" line mirrors the v0-v5 line above it for the
//  second block).  Inputs: v14, v15; key in x2 as for the 1x core.
//  Outputs: v0 (from v14), v1 (from v15).
//  Clobbers v2-v5, v8-v13, v16, x8-x11.
//
.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b,  v17.16b
	ushr	v8.16b,  v15.16b,  #4
	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b,  {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b,  v9.16b,   v16.16b
	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,   v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v24.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b,  {v8.16b}, v4.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b,  v8.16b,  v11.16b
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b,  v8.16b,  v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b, v17.16b
	ushr	v8.16b,  v8.16b, #4
	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,  v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v23.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
    324  1.1  christos 
//
//  _vpaes_decrypt_preheat
//
//  Decryption counterpart of _vpaes_encrypt_preheat:
//     v17      = 0x0f nibble mask
//     v18-v19  = .Lk_inv
//     v20-v23  = .Lk_dipt, .Lk_dsbo
//     v24-v27  = .Lk_dsb9, .Lk_dsbd
//     v28-v31  = .Lk_dsbb, .Lk_dsbe
//  Clobbers x10, x11.
//
.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, #:lo12:.Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
    339  1.1  christos 
//
//  _vpaes_decrypt_core
//
//  Same register API as _vpaes_encrypt_core: block in v7, scheduled key
//  in x2 (round count at [x2,#240]), constants preloaded by
//  _vpaes_decrypt_preheat; result in v0.
//  Clobbers v1-v5, v16, x8-x11.
//  x11 is pointed at the .Lk_sr entry selected by (rounds*16) ^ 0x30,
//  masked to 0x30 — the final ShiftRows permutation for this key size.
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, #:lo12:.Lk_sr
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, #:lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
    438  1.1  christos 
//
//  void vpaes_decrypt(const unsigned char in[16], unsigned char out[16],
//                     const AES_KEY *key)
//  x0 = in, x1 = out, x2 = scheduled key (round count at [x2,#240]).
//  NOTE(review): prototype inferred from register usage; confirm against
//  the public header.
//
.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER		// PAC macro from arm_arch.h
	stp	x29,x30,[sp,#-16]!		// standard frame; bl below clobbers LR
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]			// load ciphertext into v7 (core's input reg)
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]			// core leaves plaintext in v0

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt
    456  1.1  christos 
    457  1.1  christos // v14-v15 input, v0-v1 output
    458  1.1  christos .type	_vpaes_decrypt_2x,%function
    459  1.1  christos .align	4
    460  1.1  christos _vpaes_decrypt_2x:
    461  1.1  christos 	mov	x9, x2
    462  1.1  christos 	ldr	w8, [x2,#240]			// pull rounds
    463  1.1  christos 
    464  1.1  christos 						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
    465  1.1  christos 	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
    466  1.1  christos 	eor	x11, x11, #0x30			// xor		$0x30,	%r11
    467  1.2  christos 	adrp	x10, .Lk_sr
    468  1.2  christos 	add	x10, x10, #:lo12:.Lk_sr
    469  1.1  christos 	and	x11, x11, #0x30			// and		$0x30,	%r11
    470  1.1  christos 	add	x11, x11, x10
    471  1.2  christos 	adrp	x10, .Lk_mc_forward+48
    472  1.2  christos 	add	x10, x10, #:lo12:.Lk_mc_forward+48
    473  1.1  christos 
    474  1.1  christos 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
    475  1.1  christos 	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
    476  1.1  christos 	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
    477  1.1  christos 	and	v9.16b,  v15.16b, v17.16b
    478  1.1  christos 	ushr	v8.16b,  v15.16b, #4
    479  1.1  christos 	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
    480  1.1  christos 	tbl	v10.16b, {v20.16b},v9.16b
    481  1.1  christos 	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
    482  1.1  christos 						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
    483  1.1  christos 	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
    484  1.1  christos 	tbl	v8.16b,  {v21.16b},v8.16b
    485  1.1  christos 	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
    486  1.1  christos 	eor	v10.16b, v10.16b, v16.16b
    487  1.1  christos 	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
    488  1.1  christos 	eor	v8.16b,  v8.16b,  v10.16b
    489  1.1  christos 	b	.Ldec_2x_entry
    490  1.1  christos 
    491  1.1  christos .align	4
    492  1.1  christos .Ldec_2x_loop:
    493  1.1  christos //
    494  1.1  christos //  Inverse mix columns
    495  1.1  christos //
    496  1.1  christos 						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
    497  1.1  christos 						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
    498  1.1  christos 	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
    499  1.1  christos 	tbl	v12.16b, {v24.16b}, v10.16b
    500  1.1  christos 	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
    501  1.1  christos 	tbl	v9.16b,  {v25.16b}, v11.16b
    502  1.1  christos 	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
    503  1.1  christos 	eor	v8.16b,  v12.16b, v16.16b
    504  1.1  christos 						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
    505  1.1  christos 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    506  1.1  christos 	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    507  1.1  christos 						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
    508  1.1  christos 
    509  1.1  christos 	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
    510  1.1  christos 	tbl	v12.16b, {v26.16b}, v10.16b
    511  1.1  christos 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    512  1.1  christos 	tbl	v8.16b,  {v8.16b},v5.16b
    513  1.1  christos 	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
    514  1.1  christos 	tbl	v9.16b,  {v27.16b}, v11.16b
    515  1.1  christos 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    516  1.1  christos 	eor	v8.16b,  v8.16b,  v12.16b
    517  1.1  christos 						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
    518  1.1  christos 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    519  1.1  christos 	eor	v8.16b,  v8.16b,  v9.16b
    520  1.1  christos 						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
    521  1.1  christos 
    522  1.1  christos 	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
    523  1.1  christos 	tbl	v12.16b, {v28.16b}, v10.16b
    524  1.1  christos 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    525  1.1  christos 	tbl	v8.16b,  {v8.16b},v5.16b
    526  1.1  christos 	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
    527  1.1  christos 	tbl	v9.16b,  {v29.16b}, v11.16b
    528  1.1  christos 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    529  1.1  christos 	eor	v8.16b,  v8.16b,  v12.16b
    530  1.1  christos 						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
    531  1.1  christos 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    532  1.1  christos 	eor	v8.16b,  v8.16b,  v9.16b
    533  1.1  christos 						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
    534  1.1  christos 
    535  1.1  christos 	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
    536  1.1  christos 	tbl	v12.16b, {v30.16b}, v10.16b
    537  1.1  christos 	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
    538  1.1  christos 	tbl	v8.16b,  {v8.16b},v5.16b
    539  1.1  christos 	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
    540  1.1  christos 	tbl	v9.16b,  {v31.16b}, v11.16b
    541  1.1  christos 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
    542  1.1  christos 	eor	v8.16b,  v8.16b,  v12.16b
    543  1.1  christos 	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
    544  1.1  christos 	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
    545  1.1  christos 	eor	v8.16b,  v8.16b,  v9.16b
    546  1.1  christos 	sub	w8, w8, #1			// sub		$1,%rax			# nr--
    547  1.1  christos 
    548  1.1  christos .Ldec_2x_entry:
    549  1.1  christos 	// top of round
    550  1.1  christos 	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
    551  1.1  christos 	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
    552  1.1  christos 	and	v9.16b,  v8.16b,  v17.16b
    553  1.1  christos 	ushr	v8.16b,  v8.16b,  #4
    554  1.1  christos 	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
    555  1.1  christos 	tbl	v10.16b, {v19.16b},v9.16b
    556  1.1  christos 	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
    557  1.1  christos 	eor	v9.16b,	 v9.16b,  v8.16b
    558  1.1  christos 	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
    559  1.1  christos 	tbl	v11.16b, {v18.16b},v8.16b
    560  1.1  christos 	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
    561  1.1  christos 	tbl	v12.16b, {v18.16b},v9.16b
    562  1.1  christos 	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
    563  1.1  christos 	eor	v11.16b, v11.16b, v10.16b
    564  1.1  christos 	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
    565  1.1  christos 	eor	v12.16b, v12.16b, v10.16b
    566  1.1  christos 	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
    567  1.1  christos 	tbl	v10.16b, {v18.16b},v11.16b
    568  1.1  christos 	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
    569  1.1  christos 	tbl	v11.16b, {v18.16b},v12.16b
    570  1.1  christos 	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
    571  1.1  christos 	eor	v10.16b, v10.16b, v9.16b
    572  1.1  christos 	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
    573  1.1  christos 	eor	v11.16b, v11.16b, v8.16b
    574  1.1  christos 	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
    575  1.1  christos 	cbnz	w8, .Ldec_2x_loop
    576  1.1  christos 
    577  1.1  christos 	// middle of last round
    578  1.1  christos 						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
    579  1.1  christos 	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
    580  1.1  christos 	tbl	v12.16b, {v22.16b}, v10.16b
    581  1.1  christos 						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
    582  1.1  christos 	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
    583  1.1  christos 	tbl	v9.16b,  {v23.16b}, v11.16b
    584  1.1  christos 	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
    585  1.1  christos 	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
    586  1.1  christos 	eor	v12.16b, v12.16b, v16.16b
    587  1.1  christos 	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
    588  1.1  christos 	eor	v8.16b,  v9.16b,  v12.16b
    589  1.1  christos 	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
    590  1.1  christos 	tbl	v1.16b,  {v8.16b},v2.16b
    591  1.1  christos 	ret
    592  1.1  christos .size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
    593  1.1  christos ////////////////////////////////////////////////////////
    594  1.1  christos //                                                    //
    595  1.1  christos //                  AES key schedule                  //
    596  1.1  christos //                                                    //
    597  1.1  christos ////////////////////////////////////////////////////////
//
//  _vpaes_key_preheat
//
//  Loads the key-schedule constant tables into SIMD registers so the
//  schedule subroutines never touch memory for constants:
//    v16 = .Lk_s63 (0x5b splat)      v17 = .Lk_s0F (0x0f nibble mask)
//    v18-v19 = .Lk_inv               v20-v21 = .Lk_ipt (input transform)
//    v22-v23 = .Lk_sb1               v24-v31 = .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9
//    v8 = .Lk_rcon                   v9 = .Lk_mc_forward[0]
//  Clobbers x10, x11.  NOTE(review): v8/v9 are callee-saved low halves
//  per AAPCS64 — callers (vpaes_set_*_key) spill d8/d9 before calling here.
//
    598  1.1  christos .type	_vpaes_key_preheat,%function
    599  1.1  christos .align	4
    600  1.1  christos _vpaes_key_preheat:
    601  1.2  christos 	adrp	x10, .Lk_inv
    602  1.2  christos 	add	x10, x10, #:lo12:.Lk_inv
    603  1.1  christos 	movi	v16.16b, #0x5b			// .Lk_s63
    604  1.2  christos 	adrp	x11, .Lk_sb1
    605  1.2  christos 	add	x11, x11, #:lo12:.Lk_sb1
    606  1.1  christos 	movi	v17.16b, #0x0f			// .Lk_s0F
    607  1.1  christos 	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
    608  1.2  christos 	adrp	x10, .Lk_dksd
    609  1.2  christos 	add	x10, x10, #:lo12:.Lk_dksd
    610  1.1  christos 	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
    611  1.2  christos 	adrp	x11, .Lk_mc_forward
    612  1.2  christos 	add	x11, x11, #:lo12:.Lk_mc_forward
    613  1.1  christos 	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
    614  1.1  christos 	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
    615  1.1  christos 	ld1	{v8.2d}, [x10]			// .Lk_rcon
    616  1.1  christos 	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
    617  1.1  christos 	ret
    618  1.1  christos .size	_vpaes_key_preheat,.-_vpaes_key_preheat
    619  1.1  christos 
//
//  _vpaes_schedule_core
//
//  Key-schedule driver shared by vpaes_set_encrypt_key and
//  vpaes_set_decrypt_key.  Inputs (as set up by those callers):
//    x0 = user key pointer        w1 = key length in bits (128/192/256)
//    x2 = round-key output ptr    w3 = direction (0 = encrypt, 1 = decrypt)
//    x8 = byte offset into .Lk_sr (round mod 4 tracking for shiftrows)
//  Dispatches to the 128/192/256-bit schedules below; all paths end at
//  .Lschedule_mangle_last, which writes the final key, wipes v0-v7 and returns.
//
    620  1.1  christos .type	_vpaes_schedule_core,%function
    621  1.1  christos .align	4
    622  1.1  christos _vpaes_schedule_core:
    623  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
    624  1.1  christos 	stp	x29, x30, [sp,#-16]!
    625  1.1  christos 	add	x29,sp,#0
    626  1.1  christos 
    627  1.1  christos 	bl	_vpaes_key_preheat		// load the tables
    628  1.1  christos 
    629  1.1  christos 	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
    630  1.1  christos 
    631  1.1  christos 	// input transform
    632  1.1  christos 	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
    633  1.1  christos 	bl	_vpaes_schedule_transform
    634  1.1  christos 	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
    635  1.1  christos 
    636  1.2  christos 	adrp	x10, .Lk_sr
    637  1.2  christos 	add	x10, x10, #:lo12:.Lk_sr
    638  1.1  christos 	add	x8, x8, x10			// x8 now points into .Lk_sr
    639  1.1  christos 	cbnz	w3, .Lschedule_am_decrypting
    640  1.1  christos 
    641  1.1  christos 	// encrypting, output zeroth round key after transform
    642  1.1  christos 	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
    643  1.1  christos 	b	.Lschedule_go
    644  1.1  christos 
    645  1.1  christos .Lschedule_am_decrypting:
    646  1.1  christos 	// decrypting, output zeroth round key after shiftrows
    647  1.1  christos 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
    648  1.1  christos 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
    649  1.1  christos 	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
    650  1.1  christos 	eor	x8, x8, #0x30			// xor	$0x30, %r8
    651  1.1  christos 
    652  1.1  christos .Lschedule_go:
    653  1.1  christos 	cmp	w1, #192			// cmp	$192,	%esi
    654  1.1  christos 	b.hi	.Lschedule_256
    655  1.1  christos 	b.eq	.Lschedule_192
    656  1.1  christos 	// 128: fall through
    657  1.1  christos 
    658  1.1  christos //
    659  1.1  christos //  .schedule_128
    660  1.1  christos //
    661  1.1  christos //  128-bit specific part of key schedule.
    662  1.1  christos //
    663  1.1  christos //  This schedule is really simple, because all its parts
    664  1.1  christos //  are accomplished by the subroutines.
    665  1.1  christos //
    666  1.1  christos .Lschedule_128:
    667  1.1  christos 	mov	x0, #10			// mov	$10, %esi
    668  1.1  christos 
    669  1.1  christos .Loop_schedule_128:
    670  1.1  christos 	sub	x0, x0, #1			// dec	%esi
    671  1.1  christos 	bl	_vpaes_schedule_round
    672  1.1  christos 	cbz	x0, .Lschedule_mangle_last
    673  1.1  christos 	bl	_vpaes_schedule_mangle		// write output
    674  1.1  christos 	b	.Loop_schedule_128
    675  1.1  christos 
    676  1.1  christos //
    677  1.1  christos //  .aes_schedule_192
    678  1.1  christos //
    679  1.1  christos //  192-bit specific part of key schedule.
    680  1.1  christos //
    681  1.1  christos //  The main body of this schedule is the same as the 128-bit
    682  1.1  christos //  schedule, but with more smearing.  The long, high side is
    683  1.1  christos //  stored in %xmm7 as before, and the short, low side is in
    684  1.1  christos //  the high bits of %xmm6.
    685  1.1  christos //
    686  1.1  christos //  This schedule is somewhat nastier, however, because each
    687  1.1  christos //  round produces 192 bits of key material, or 1.5 round keys.
    688  1.1  christos //  Therefore, on each cycle we do 2 rounds and produce 3 round
    689  1.1  christos //  keys.
    690  1.1  christos //
    691  1.1  christos .align	4
    692  1.1  christos .Lschedule_192:
    693  1.1  christos 	sub	x0, x0, #8		// back up to key+8: first 16 bytes already consumed above
    694  1.1  christos 	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
    695  1.1  christos 	bl	_vpaes_schedule_transform	// input transform
    696  1.1  christos 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
    697  1.1  christos 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
    698  1.1  christos 	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
    699  1.1  christos 	mov	x0, #4			// mov	$4,	%esi
    700  1.1  christos 
    701  1.1  christos .Loop_schedule_192:
    702  1.1  christos 	sub	x0, x0, #1			// dec	%esi
    703  1.1  christos 	bl	_vpaes_schedule_round
    704  1.1  christos 	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
    705  1.1  christos 	bl	_vpaes_schedule_mangle		// save key n
    706  1.1  christos 	bl	_vpaes_schedule_192_smear
    707  1.1  christos 	bl	_vpaes_schedule_mangle		// save key n+1
    708  1.1  christos 	bl	_vpaes_schedule_round
    709  1.1  christos 	cbz	x0, .Lschedule_mangle_last
    710  1.1  christos 	bl	_vpaes_schedule_mangle		// save key n+2
    711  1.1  christos 	bl	_vpaes_schedule_192_smear
    712  1.1  christos 	b	.Loop_schedule_192
    713  1.1  christos 
    714  1.1  christos //
    715  1.1  christos //  .aes_schedule_256
    716  1.1  christos //
    717  1.1  christos //  256-bit specific part of key schedule.
    718  1.1  christos //
    719  1.1  christos //  The structure here is very similar to the 128-bit
    720  1.1  christos //  schedule, but with an additional "low side" in
    721  1.1  christos //  %xmm6.  The low side's rounds are the same as the
    722  1.1  christos //  high side's, except no rcon and no rotation.
    723  1.1  christos //
    724  1.1  christos .align	4
    725  1.1  christos .Lschedule_256:
    726  1.1  christos 	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
    727  1.1  christos 	bl	_vpaes_schedule_transform	// input transform
    728  1.1  christos 	mov	x0, #7			// mov	$7, %esi
    729  1.1  christos 
    730  1.1  christos .Loop_schedule_256:
    731  1.1  christos 	sub	x0, x0, #1			// dec	%esi
    732  1.1  christos 	bl	_vpaes_schedule_mangle		// output low result
    733  1.1  christos 	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
    734  1.1  christos 
    735  1.1  christos 	// high round
    736  1.1  christos 	bl	_vpaes_schedule_round
    737  1.1  christos 	cbz	x0, .Lschedule_mangle_last
    738  1.1  christos 	bl	_vpaes_schedule_mangle
    739  1.1  christos 
    740  1.1  christos 	// low round. swap xmm7 and xmm6
    741  1.1  christos 	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
    742  1.1  christos 	movi	v4.16b, #0
    743  1.1  christos 	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
    744  1.1  christos 	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
    745  1.1  christos 	bl	_vpaes_schedule_low_round
    746  1.1  christos 	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
    747  1.1  christos 
    748  1.1  christos 	b	.Loop_schedule_256
    749  1.1  christos 
    750  1.1  christos //
    751  1.1  christos //  .aes_schedule_mangle_last
    752  1.1  christos //
    753  1.1  christos //  Mangler for last round of key schedule
    754  1.1  christos //  Mangles %xmm0
    755  1.1  christos //    when encrypting, outputs out(%xmm0) ^ 63
    756  1.1  christos //    when decrypting, outputs unskew(%xmm0)
    757  1.1  christos //
    758  1.1  christos //  Always called right before return... jumps to cleanup and exits
    759  1.1  christos //
    760  1.1  christos .align	4
    761  1.1  christos .Lschedule_mangle_last:
    762  1.1  christos 	// schedule last round key from xmm0
    763  1.2  christos 	adrp	x11, .Lk_deskew
    764  1.2  christos 	add	x11, x11, #:lo12:.Lk_deskew
    765  1.1  christos 	cbnz	w3, .Lschedule_mangle_last_dec
    766  1.1  christos 
    767  1.1  christos 	// encrypting
    768  1.1  christos 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
    769  1.2  christos 	adrp	x11, .Lk_opt		// encrypt path uses .Lk_opt instead of .Lk_deskew
    770  1.2  christos 	add	x11, x11, #:lo12:.Lk_opt
    771  1.1  christos 	add	x2, x2, #32			// add	$32,	%rdx
    772  1.1  christos 	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
    773  1.1  christos 
    774  1.1  christos .Lschedule_mangle_last_dec:
    775  1.1  christos 	ld1	{v20.2d,v21.2d}, [x11]		// reload constants (v20/v21 feed _vpaes_schedule_transform)
    776  1.1  christos 	sub	x2, x2, #16			// add	$-16,	%rdx
    777  1.1  christos 	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
    778  1.1  christos 	bl	_vpaes_schedule_transform	// output transform
    779  1.1  christos 	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
    780  1.1  christos 
    781  1.1  christos 	// cleanup: scrub key material left in v0-v7
    782  1.1  christos 	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
    783  1.1  christos 	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
    784  1.1  christos 	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
    785  1.1  christos 	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
    786  1.1  christos 	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
    787  1.1  christos 	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
    788  1.1  christos 	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
    789  1.1  christos 	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
    790  1.1  christos 	ldp	x29, x30, [sp],#16
    791  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
    792  1.1  christos 	ret
    793  1.1  christos .size	_vpaes_schedule_core,.-_vpaes_schedule_core
    794  1.1  christos 
    795  1.1  christos //
    796  1.1  christos //  .aes_schedule_192_smear
    797  1.1  christos //
    798  1.1  christos //  Smear the short, low side in the 192-bit key schedule.
    799  1.1  christos //
    800  1.1  christos //  Inputs:
    801  1.1  christos //    %xmm7: high side, b  a  x  y
    802  1.1  christos //    %xmm6:  low side, d  c  0  0
    803  1.1  christos //    %xmm13: 0
    804  1.1  christos //
    805  1.1  christos //  Outputs:
    806  1.1  christos //    %xmm6: b+c+d  b+c  0  0
    807  1.1  christos //    %xmm0: b+c+d  b+c  b  a
    808  1.1  christos //
// (See banner above for the register contract: v7 = high side, v6 = low
//  side with its low 64 bits zero.  Outputs v6 and v0 as documented;
//  v1 is used as a zero/scratch register and left zeroed.)
    809  1.1  christos .type	_vpaes_schedule_192_smear,%function
    810  1.1  christos .align	4
    811  1.1  christos _vpaes_schedule_192_smear:
    812  1.1  christos 	movi	v1.16b, #0		// start from all-zero so only lane 3 is live after ins
    813  1.1  christos 	dup	v0.4s, v7.s[3]
    814  1.1  christos 	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
    815  1.1  christos 	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
    816  1.1  christos 	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
    817  1.1  christos 	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
    818  1.1  christos 	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
    819  1.1  christos 	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
    820  1.1  christos 	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
    821  1.1  christos 	ret
    822  1.1  christos .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
    823  1.1  christos 
    824  1.1  christos //
    825  1.1  christos //  .aes_schedule_round
    826  1.1  christos //
    827  1.1  christos //  Runs one main round of the key schedule on %xmm0, %xmm7
    828  1.1  christos //
    829  1.1  christos //  Specifically, runs subbytes on the high dword of %xmm0
    830  1.1  christos //  then rotates it by one byte and xors into the low dword of
    831  1.1  christos //  %xmm7.
    832  1.1  christos //
    833  1.1  christos //  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
    834  1.1  christos //  next rcon.
    835  1.1  christos //
    836  1.1  christos //  Smears the dwords of %xmm7 by xoring the low into the
    837  1.1  christos //  second low, result into third, result into highest.
    838  1.1  christos //
    839  1.1  christos //  Returns results in %xmm7 = %xmm0.
    840  1.1  christos //  Clobbers %xmm1-%xmm4, %r11.
    841  1.1  christos //
// (See banner above.  Register mapping vs. the x86 comments: v17 = 0x0f
//  mask (xmm9), v18/v19 = .Lk_inv (xmm10/xmm11), v22/v23 = .Lk_sb1
//  (xmm12/xmm13), v16 = .Lk_s63, v8 = rcon.  _vpaes_schedule_low_round
//  is a secondary entry that skips the rcon/rotate prologue; it expects
//  v4 = 0 on entry, which both callers establish.)
    842  1.1  christos .type	_vpaes_schedule_round,%function
    843  1.1  christos .align	4
    844  1.1  christos _vpaes_schedule_round:
    845  1.1  christos 	// extract rcon from xmm8
    846  1.1  christos 	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
    847  1.1  christos 	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
    848  1.1  christos 	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
    849  1.1  christos 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
    850  1.1  christos 
    851  1.1  christos 	// rotate
    852  1.1  christos 	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
    853  1.1  christos 	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
    854  1.1  christos 
    855  1.1  christos 	// fall through...
    856  1.1  christos 
    857  1.1  christos 	// low round: same as high round, but no rotation and no rcon.
    858  1.1  christos _vpaes_schedule_low_round:
    859  1.1  christos 	// smear xmm7
    860  1.1  christos 	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
    861  1.1  christos 	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
    862  1.1  christos 	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
    863  1.1  christos 
    864  1.1  christos 	// subbytes
    865  1.1  christos 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
    866  1.1  christos 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
    867  1.1  christos 	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
    868  1.1  christos 	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
    869  1.1  christos 	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
    870  1.1  christos 	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
    871  1.1  christos 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
    872  1.1  christos 	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
    873  1.1  christos 	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
    874  1.1  christos 	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
    875  1.1  christos 	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
    876  1.1  christos 	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
    877  1.1  christos 	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
    878  1.1  christos 	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
    879  1.1  christos 	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
    880  1.1  christos 	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
    881  1.1  christos 	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
    882  1.1  christos 
    883  1.1  christos 	// add in smeared stuff
    884  1.1  christos 	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
    885  1.1  christos 	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7	(same value, recomputed)
    886  1.1  christos 	ret
    887  1.1  christos .size	_vpaes_schedule_round,.-_vpaes_schedule_round
    888  1.1  christos 
    889  1.1  christos //
    890  1.1  christos //  .aes_schedule_transform
    891  1.1  christos //
    892  1.1  christos //  Linear-transform %xmm0 according to tables at (%r11)
    893  1.1  christos //
    894  1.1  christos //  Requires that %xmm9 = 0x0F0F... as in preheat
    895  1.1  christos //  Output in %xmm0
    896  1.1  christos //  Clobbers %xmm1, %xmm2
    897  1.1  christos //
// (See banner above.  AArch64 mapping: the lo/hi tables the x86 code
//  loaded from (%r11) are pre-loaded in v20/v21 — .Lk_ipt by
//  _vpaes_key_preheat, or .Lk_opt/.Lk_deskew reloaded at
//  .Lschedule_mangle_last_dec.  v17 is the 0x0f nibble mask.
//  In: v0.  Out: v0.  Clobbers v1, v2.)
    898  1.1  christos .type	_vpaes_schedule_transform,%function
    899  1.1  christos .align	4
    900  1.1  christos _vpaes_schedule_transform:
    901  1.1  christos 	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
    902  1.1  christos 	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
    903  1.1  christos 						// vmovdqa	(%r11),	%xmm2 	# lo
    904  1.1  christos 	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
    905  1.1  christos 						// vmovdqa	16(%r11),	%xmm1 # hi
    906  1.1  christos 	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
    907  1.1  christos 	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
    908  1.1  christos 	ret
    909  1.1  christos .size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
    910  1.1  christos 
    911  1.1  christos //
    912  1.1  christos //  .aes_schedule_mangle
    913  1.1  christos //
    914  1.1  christos //  Mangle xmm0 from (basis-transformed) standard version
    915  1.1  christos //  to our version.
    916  1.1  christos //
    917  1.1  christos //  On encrypt,
    918  1.1  christos //    xor with 0x63
    919  1.1  christos //    multiply by circulant 0,1,1,1
    920  1.1  christos //    apply shiftrows transform
    921  1.1  christos //
    922  1.1  christos //  On decrypt,
    923  1.1  christos //    xor with 0x63
    924  1.1  christos //    multiply by "inverse mixcolumns" circulant E,B,D,9
    925  1.1  christos //    deskew
    926  1.1  christos //    apply shiftrows transform
    927  1.1  christos //
    928  1.1  christos //
    929  1.1  christos //  Writes out to (%rdx), and increments or decrements it
    930  1.1  christos //  Keeps track of round number mod 4 in %r8
    931  1.1  christos //  Preserves xmm0
    932  1.1  christos //  Clobbers xmm1-xmm5
    933  1.1  christos //
// (See banner above.  AArch64 mapping: v9 = .Lk_mc_forward[0] (the xmm5
//  of the x86 comments), v16 = .Lk_s63, v17 = 0x0f mask, v24-v31 =
//  .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9.  x2 = output pointer (moves by
//  16 each call), x8 = pointer into .Lk_sr tracking round mod 4, w3 =
//  direction flag set by the set_*_key entry points.)
    934  1.1  christos .type	_vpaes_schedule_mangle,%function
    935  1.1  christos .align	4
    936  1.1  christos _vpaes_schedule_mangle:
    937  1.1  christos 	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
    938  1.1  christos 						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
    939  1.1  christos 	cbnz	w3, .Lschedule_mangle_dec
    940  1.1  christos 
    941  1.1  christos 	// encrypting
    942  1.1  christos 	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
    943  1.1  christos 	add	x2, x2, #16			// add	$16,	%rdx
    944  1.1  christos 	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
    945  1.1  christos 	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
    946  1.1  christos 	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
    947  1.1  christos 	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
    948  1.1  christos 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
    949  1.1  christos 	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
    950  1.1  christos 
    951  1.1  christos 	b	.Lschedule_mangle_both
    952  1.1  christos .align	4
    953  1.1  christos .Lschedule_mangle_dec:
    954  1.1  christos 	// inverse mix columns
    955  1.1  christos 						// lea	.Lk_dksd(%rip),%r11
    956  1.1  christos 	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
    957  1.1  christos 	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
    958  1.1  christos 
    959  1.1  christos 						// vmovdqa	0x00(%r11),	%xmm2
    960  1.1  christos 	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
    961  1.1  christos 						// vmovdqa	0x10(%r11),	%xmm3
    962  1.1  christos 	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
    963  1.1  christos 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
    964  1.1  christos 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
    965  1.1  christos 
    966  1.1  christos 						// vmovdqa	0x20(%r11),	%xmm2
    967  1.1  christos 	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
    968  1.1  christos 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
    969  1.1  christos 						// vmovdqa	0x30(%r11),	%xmm3
    970  1.1  christos 	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
    971  1.1  christos 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
    972  1.1  christos 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
    973  1.1  christos 
    974  1.1  christos 						// vmovdqa	0x40(%r11),	%xmm2
    975  1.1  christos 	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
    976  1.1  christos 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
    977  1.1  christos 						// vmovdqa	0x50(%r11),	%xmm3
    978  1.1  christos 	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
    979  1.1  christos 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
    980  1.1  christos 
    981  1.1  christos 						// vmovdqa	0x60(%r11),	%xmm2
    982  1.1  christos 	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
    983  1.1  christos 	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
    984  1.1  christos 						// vmovdqa	0x70(%r11),	%xmm4
    985  1.1  christos 	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
    986  1.1  christos 	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
    987  1.1  christos 	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
    988  1.1  christos 	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
    989  1.1  christos 
    990  1.1  christos 	sub	x2, x2, #16			// add	$-16,	%rdx
    991  1.1  christos 
    992  1.1  christos .Lschedule_mangle_both:
    993  1.1  christos 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
    994  1.1  christos 	add	x8, x8, #64-16			// add	$-16,	%r8	(+48 == -16 mod 64)
    995  1.1  christos 	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8	# keep x8 within .Lk_sr
    996  1.1  christos 	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
    997  1.1  christos 	ret
    998  1.1  christos .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
    999  1.1  christos 
//
//  int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//                            AES_KEY *key)
//  In:  x0 = user key, w1 = key length in bits, x2 = AES_KEY output.
//  Out: x0 = 0 (always succeeds); stores rounds = bits/32 + 5 at key+240.
//  Spills d8/d9 because _vpaes_key_preheat loads constants into v8/v9,
//  whose low 64 bits are callee-saved under AAPCS64.
//
   1000  1.1  christos .globl	vpaes_set_encrypt_key
   1001  1.1  christos .type	vpaes_set_encrypt_key,%function
   1002  1.1  christos .align	4
   1003  1.1  christos vpaes_set_encrypt_key:
   1004  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
   1005  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1006  1.1  christos 	add	x29,sp,#0
   1007  1.1  christos 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1008  1.1  christos 
   1009  1.1  christos 	lsr	w9, w1, #5		// shr	$5,%eax
   1010  1.1  christos 	add	w9, w9, #5		// $5,%eax
   1011  1.1  christos 	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
   1012  1.1  christos 
   1013  1.1  christos 	mov	w3, #0		// mov	$0,%ecx		# direction = encrypt
   1014  1.1  christos 	mov	x8, #0x30		// mov	$0x30,%r8d
   1015  1.1  christos 	bl	_vpaes_schedule_core
   1016  1.1  christos 	eor	x0, x0, x0		// return 0
   1017  1.1  christos 
   1018  1.1  christos 	ldp	d8,d9,[sp],#16
   1019  1.1  christos 	ldp	x29,x30,[sp],#16
   1020  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1021  1.1  christos 	ret
   1022  1.1  christos .size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
   1023  1.1  christos 
//
//  int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
//                            AES_KEY *key)
//  In:  x0 = user key, w1 = key length in bits, x2 = AES_KEY output.
//  Stores rounds = bits/32 + 5 at key+240, then points x2 at the END of
//  the schedule (key + 16 + 16*rounds) because the decrypt schedule is
//  written backwards by _vpaes_schedule_mangle (x2 decrements).
//  NOTE(review): unlike the encrypt entry, x0 is not zeroed before
//  return — callers appear to ignore the return value; confirm.
//
   1024  1.1  christos .globl	vpaes_set_decrypt_key
   1025  1.1  christos .type	vpaes_set_decrypt_key,%function
   1026  1.1  christos .align	4
   1027  1.1  christos vpaes_set_decrypt_key:
   1028  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
   1029  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1030  1.1  christos 	add	x29,sp,#0
   1031  1.1  christos 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1032  1.1  christos 
   1033  1.1  christos 	lsr	w9, w1, #5		// shr	$5,%eax
   1034  1.1  christos 	add	w9, w9, #5		// $5,%eax
   1035  1.1  christos 	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
   1036  1.1  christos 	lsl	w9, w9, #4		// shl	$4,%eax
   1037  1.1  christos 	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
   1038  1.1  christos 	add	x2, x2, x9
   1039  1.1  christos 
   1040  1.1  christos 	mov	w3, #1		// mov	$1,%ecx		# direction = decrypt
   1041  1.1  christos 	lsr	w8, w1, #1		// shr	$1,%r8d
   1042  1.1  christos 	and	x8, x8, #32		// and	$32,%r8d
   1043  1.1  christos 	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
   1044  1.1  christos 	bl	_vpaes_schedule_core
   1045  1.1  christos 
   1046  1.1  christos 	ldp	d8,d9,[sp],#16
   1047  1.1  christos 	ldp	x29,x30,[sp],#16
   1048  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1049  1.1  christos 	ret
   1050  1.1  christos .size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
//
//  void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                         size_t length, const AES_KEY *key,
//                         unsigned char *ivec, int enc)
//  In:  x0 = in, x1 = out, x2 = length in bytes, x3 = key,
//       x4 = ivec (updated in place), w5 = direction (0 tail-calls
//       vpaes_cbc_decrypt).  Returns immediately when length == 0.
//  NOTE(review): the loop runs in 16-byte steps (subs #16 / b.hi), so a
//  length that is not a multiple of 16 over-reads the final partial
//  block — callers presumably pass whole blocks only; confirm.
//
   1051  1.1  christos .globl	vpaes_cbc_encrypt
   1052  1.1  christos .type	vpaes_cbc_encrypt,%function
   1053  1.1  christos .align	4
   1054  1.1  christos vpaes_cbc_encrypt:
   1055  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
   1056  1.1  christos 	cbz	x2, .Lcbc_abort
   1057  1.1  christos 	cmp	w5, #0			// check direction
   1058  1.1  christos 	b.eq	vpaes_cbc_decrypt
   1059  1.1  christos 
   1060  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1061  1.1  christos 	add	x29,sp,#0
   1062  1.1  christos 
   1063  1.1  christos 	mov	x17, x2		// reassign	# x17 = remaining byte count
   1064  1.1  christos 	mov	x2,  x3		// reassign	# x2 = key, as _vpaes_encrypt_core expects
   1065  1.1  christos 
   1066  1.1  christos 	ld1	{v0.16b}, [x4]	// load ivec
   1067  1.1  christos 	bl	_vpaes_encrypt_preheat
   1068  1.1  christos 	b	.Lcbc_enc_loop
   1069  1.1  christos 
   1070  1.1  christos .align	4
   1071  1.1  christos .Lcbc_enc_loop:
   1072  1.1  christos 	ld1	{v7.16b}, [x0],#16	// load input
   1073  1.1  christos 	eor	v7.16b, v7.16b, v0.16b	// xor with ivec (v0 = previous ciphertext)
   1074  1.1  christos 	bl	_vpaes_encrypt_core
   1075  1.1  christos 	st1	{v0.16b}, [x1],#16	// save output
   1076  1.1  christos 	subs	x17, x17, #16
   1077  1.1  christos 	b.hi	.Lcbc_enc_loop
   1078  1.1  christos 
   1079  1.1  christos 	st1	{v0.16b}, [x4]	// write ivec	# final block becomes next IV
   1080  1.1  christos 
   1081  1.1  christos 	ldp	x29,x30,[sp],#16
   1082  1.1  christos .Lcbc_abort:
   1083  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1084  1.1  christos 	ret
   1085  1.1  christos .size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
   1086  1.1  christos 
   1087  1.1  christos .type	vpaes_cbc_decrypt,%function
   1088  1.1  christos .align	4
   1089  1.1  christos vpaes_cbc_decrypt:
   1090  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
   1091  1.2  christos 	// only from vpaes_cbc_encrypt which has already signed the return address.
   1092  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1093  1.1  christos 	add	x29,sp,#0
   1094  1.1  christos 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1095  1.1  christos 	stp	d10,d11,[sp,#-16]!
   1096  1.1  christos 	stp	d12,d13,[sp,#-16]!
   1097  1.1  christos 	stp	d14,d15,[sp,#-16]!
   1098  1.1  christos 
   1099  1.1  christos 	mov	x17, x2		// reassign
   1100  1.1  christos 	mov	x2,  x3		// reassign
   1101  1.1  christos 	ld1	{v6.16b}, [x4]	// load ivec
   1102  1.1  christos 	bl	_vpaes_decrypt_preheat
   1103  1.1  christos 	tst	x17, #16
   1104  1.1  christos 	b.eq	.Lcbc_dec_loop2x
   1105  1.1  christos 
   1106  1.1  christos 	ld1	{v7.16b}, [x0], #16	// load input
   1107  1.1  christos 	bl	_vpaes_decrypt_core
   1108  1.1  christos 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
   1109  1.1  christos 	orr	v6.16b, v7.16b, v7.16b	// next ivec value
   1110  1.1  christos 	st1	{v0.16b}, [x1], #16
   1111  1.1  christos 	subs	x17, x17, #16
   1112  1.1  christos 	b.ls	.Lcbc_dec_done
   1113  1.1  christos 
   1114  1.1  christos .align	4
   1115  1.1  christos .Lcbc_dec_loop2x:
   1116  1.1  christos 	ld1	{v14.16b,v15.16b}, [x0], #32
   1117  1.1  christos 	bl	_vpaes_decrypt_2x
   1118  1.1  christos 	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
   1119  1.1  christos 	eor	v1.16b, v1.16b, v14.16b
   1120  1.1  christos 	orr	v6.16b, v15.16b, v15.16b
   1121  1.1  christos 	st1	{v0.16b,v1.16b}, [x1], #32
   1122  1.1  christos 	subs	x17, x17, #32
   1123  1.1  christos 	b.hi	.Lcbc_dec_loop2x
   1124  1.1  christos 
   1125  1.1  christos .Lcbc_dec_done:
   1126  1.1  christos 	st1	{v6.16b}, [x4]
   1127  1.1  christos 
   1128  1.1  christos 	ldp	d14,d15,[sp],#16
   1129  1.1  christos 	ldp	d12,d13,[sp],#16
   1130  1.1  christos 	ldp	d10,d11,[sp],#16
   1131  1.1  christos 	ldp	d8,d9,[sp],#16
   1132  1.1  christos 	ldp	x29,x30,[sp],#16
   1133  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1134  1.1  christos 	ret
   1135  1.1  christos .size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
   1136  1.1  christos .globl	vpaes_ecb_encrypt
   1137  1.1  christos .type	vpaes_ecb_encrypt,%function
   1138  1.1  christos .align	4
   1139  1.1  christos vpaes_ecb_encrypt:
   1140  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
   1141  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1142  1.1  christos 	add	x29,sp,#0
   1143  1.1  christos 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1144  1.1  christos 	stp	d10,d11,[sp,#-16]!
   1145  1.1  christos 	stp	d12,d13,[sp,#-16]!
   1146  1.1  christos 	stp	d14,d15,[sp,#-16]!
   1147  1.1  christos 
   1148  1.1  christos 	mov	x17, x2
   1149  1.1  christos 	mov	x2,  x3
   1150  1.1  christos 	bl	_vpaes_encrypt_preheat
   1151  1.1  christos 	tst	x17, #16
   1152  1.1  christos 	b.eq	.Lecb_enc_loop
   1153  1.1  christos 
   1154  1.1  christos 	ld1	{v7.16b}, [x0],#16
   1155  1.1  christos 	bl	_vpaes_encrypt_core
   1156  1.1  christos 	st1	{v0.16b}, [x1],#16
   1157  1.1  christos 	subs	x17, x17, #16
   1158  1.1  christos 	b.ls	.Lecb_enc_done
   1159  1.1  christos 
   1160  1.1  christos .align	4
   1161  1.1  christos .Lecb_enc_loop:
   1162  1.1  christos 	ld1	{v14.16b,v15.16b}, [x0], #32
   1163  1.1  christos 	bl	_vpaes_encrypt_2x
   1164  1.1  christos 	st1	{v0.16b,v1.16b}, [x1], #32
   1165  1.1  christos 	subs	x17, x17, #32
   1166  1.1  christos 	b.hi	.Lecb_enc_loop
   1167  1.1  christos 
   1168  1.1  christos .Lecb_enc_done:
   1169  1.1  christos 	ldp	d14,d15,[sp],#16
   1170  1.1  christos 	ldp	d12,d13,[sp],#16
   1171  1.1  christos 	ldp	d10,d11,[sp],#16
   1172  1.1  christos 	ldp	d8,d9,[sp],#16
   1173  1.1  christos 	ldp	x29,x30,[sp],#16
   1174  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1175  1.1  christos 	ret
   1176  1.1  christos .size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
   1177  1.1  christos 
   1178  1.1  christos .globl	vpaes_ecb_decrypt
   1179  1.1  christos .type	vpaes_ecb_decrypt,%function
   1180  1.1  christos .align	4
   1181  1.1  christos vpaes_ecb_decrypt:
   1182  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
   1183  1.1  christos 	stp	x29,x30,[sp,#-16]!
   1184  1.1  christos 	add	x29,sp,#0
   1185  1.1  christos 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
   1186  1.1  christos 	stp	d10,d11,[sp,#-16]!
   1187  1.1  christos 	stp	d12,d13,[sp,#-16]!
   1188  1.1  christos 	stp	d14,d15,[sp,#-16]!
   1189  1.1  christos 
   1190  1.1  christos 	mov	x17, x2
   1191  1.1  christos 	mov	x2,  x3
   1192  1.1  christos 	bl	_vpaes_decrypt_preheat
   1193  1.1  christos 	tst	x17, #16
   1194  1.1  christos 	b.eq	.Lecb_dec_loop
   1195  1.1  christos 
   1196  1.1  christos 	ld1	{v7.16b}, [x0],#16
   1197  1.1  christos 	bl	_vpaes_encrypt_core
   1198  1.1  christos 	st1	{v0.16b}, [x1],#16
   1199  1.1  christos 	subs	x17, x17, #16
   1200  1.1  christos 	b.ls	.Lecb_dec_done
   1201  1.1  christos 
   1202  1.1  christos .align	4
   1203  1.1  christos .Lecb_dec_loop:
   1204  1.1  christos 	ld1	{v14.16b,v15.16b}, [x0], #32
   1205  1.1  christos 	bl	_vpaes_decrypt_2x
   1206  1.1  christos 	st1	{v0.16b,v1.16b}, [x1], #32
   1207  1.1  christos 	subs	x17, x17, #32
   1208  1.1  christos 	b.hi	.Lecb_dec_loop
   1209  1.1  christos 
   1210  1.1  christos .Lecb_dec_done:
   1211  1.1  christos 	ldp	d14,d15,[sp],#16
   1212  1.1  christos 	ldp	d12,d13,[sp],#16
   1213  1.1  christos 	ldp	d10,d11,[sp],#16
   1214  1.1  christos 	ldp	d8,d9,[sp],#16
   1215  1.1  christos 	ldp	x29,x30,[sp],#16
   1216  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1217  1.1  christos 	ret
   1218  1.1  christos .size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
   1219