Home | History | Annotate | Line # | Download | only in aarch64
      1  1.1  christos #include "arm_arch.h"
      2  1.1  christos 
      3  1.1  christos .text
      4  1.1  christos 
      5  1.1  christos // forward "declarations" are required for Apple
      6  1.1  christos 
      7  1.3  christos .hidden	OPENSSL_armcap_P
      8  1.3  christos .globl	poly1305_init
      9  1.3  christos .hidden	poly1305_init
     10  1.1  christos .globl	poly1305_blocks
     11  1.3  christos .hidden	poly1305_blocks
     12  1.1  christos .globl	poly1305_emit
     13  1.3  christos .hidden	poly1305_emit
     14  1.1  christos 
     15  1.1  christos .type	poly1305_init,%function
                       // poly1305_init: reset a Poly1305 context and install the clamped key.
                       //
                       // In:    x0 = context; bytes 0..31 are zeroed, clamped key goes to [x0,#32]
                       //        x1 = pointer to the key (two 64-bit words are read); may be NULL
                       //        x2 = pointer to a two-entry table that receives the addresses of
                       //             the block and emit routines chosen for this CPU
                       // Out:   x0 = 0 if x1 was NULL (context is still zeroed), otherwise 1
                       // Clobb: x7, x8, x9, x12, x13, x17, flags
     16  1.1  christos .align	5
     17  1.1  christos poly1305_init:
     18  1.1  christos 	cmp	x1,xzr			// NULL key? flags consumed by csel/b.eq below
     19  1.1  christos 	stp	xzr,xzr,[x0]		// zero hash value
     20  1.1  christos 	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]
     21  1.1  christos 
     22  1.1  christos 	csel	x0,xzr,x0,eq		// return value becomes 0 when there is no key
     23  1.1  christos 	b.eq	.Lno_key
     24  1.1  christos 
     25  1.4  christos 	adrp	x17,OPENSSL_armcap_P	// w17 = CPU capability word
     26  1.4  christos 	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
     27  1.1  christos 
     28  1.1  christos 	ldp	x7,x8,[x1]		// load key
     29  1.1  christos 	mov	x9,#0xfffffffc0fffffff	// build clamp mask 0x0ffffffc0fffffff ...
     30  1.1  christos 	movk	x9,#0x0fff,lsl#48	// ... by replacing the top 16 bits
     31  1.5  christos #ifdef	__AARCH64EB__
     32  1.1  christos 	rev	x7,x7			// flip bytes
     33  1.1  christos 	rev	x8,x8
     34  1.1  christos #endif
     35  1.1  christos 	and	x7,x7,x9		// &=0ffffffc0fffffff
     36  1.1  christos 	and	x9,x9,#-4		// second word additionally clears its two low bits
     37  1.1  christos 	and	x8,x8,x9		// &=0ffffffc0ffffffc
     38  1.1  christos 	stp	x7,x8,[x0,#32]	// save key value
     39  1.1  christos 
     40  1.1  christos 	tst	w17,#ARMV7_NEON		// Z set when the NEON capability bit is clear
     41  1.1  christos 
                       // adr does not modify the flags, so the tst result above is still
                       // valid for the two csel instructions that follow.
     42  1.4  christos 	adr	x12,.Lpoly1305_blocks
     43  1.4  christos 	adr	x7,.Lpoly1305_blocks_neon
     44  1.4  christos 	adr	x13,.Lpoly1305_emit
     45  1.4  christos 	adr	x8,.Lpoly1305_emit_neon
     46  1.1  christos 
     47  1.1  christos 	csel	x12,x12,x7,eq		// scalar routines without NEON, NEON ones otherwise
     48  1.1  christos 	csel	x13,x13,x8,eq
     49  1.1  christos 
     50  1.1  christos #ifdef	__ILP32__
     51  1.1  christos 	stp	w12,w13,[x2]		// 32-bit pointers: store the low words only
     52  1.1  christos #else
     53  1.1  christos 	stp	x12,x13,[x2]
     54  1.1  christos #endif
     55  1.1  christos 
     56  1.1  christos 	mov	x0,#1			// report that the function table was filled in
     57  1.1  christos .Lno_key:
     58  1.1  christos 	ret
     59  1.1  christos .size	poly1305_init,.-poly1305_init
     60  1.1  christos 
     61  1.1  christos .type	poly1305_blocks,%function
                       // poly1305_blocks: scalar (base 2^64) processing of whole 16-byte blocks.
                       //
                       // In:    x0 = context: h0,h1 at [x0], h2 at [x0,#16], r0,r1 at [x0,#32]
                       //        x1 = input pointer (post-incremented by 16 per block)
                       //        x2 = length in bytes; rounded down to a multiple of 16, and the
                       //             routine returns without touching the context if that is 0
                       //        x3 = value added into h2 together with each block
                       //             (presumably the pad bit supplied by the caller -- confirm)
                       // Out:   updated h0,h1,h2 written back to the context
                       // Clobb: x1, x2, x4-x14, flags
     62  1.1  christos .align	5
     63  1.1  christos poly1305_blocks:
     64  1.4  christos .Lpoly1305_blocks:
     65  1.1  christos 	ands	x2,x2,#-16		// drop any partial trailing block
     66  1.1  christos 	b.eq	.Lno_data
     67  1.1  christos 
     68  1.1  christos 	ldp	x4,x5,[x0]		// load hash value
     69  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
     70  1.1  christos 	ldr	x6,[x0,#16]
     71  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
     72  1.1  christos 	b	.Loop			// jump over the alignment padding
     73  1.1  christos 
     74  1.1  christos .align	5
     75  1.1  christos .Loop:
     76  1.1  christos 	ldp	x10,x11,[x1],#16	// load input
     77  1.1  christos 	sub	x2,x2,#16
     78  1.5  christos #ifdef	__AARCH64EB__
     79  1.1  christos 	rev	x10,x10
     80  1.1  christos 	rev	x11,x11
     81  1.1  christos #endif
     82  1.1  christos 	adds	x4,x4,x10		// accumulate input
     83  1.1  christos 	adcs	x5,x5,x11
     84  1.1  christos 
                       // mul/umulh leave the flags alone, so the carry from the adcs above
                       // is still live when the adc into x6 executes.
     85  1.1  christos 	mul	x12,x4,x7		// h0*r0
     86  1.1  christos 	adc	x6,x6,x3
     87  1.1  christos 	umulh	x13,x4,x7
     88  1.1  christos 
     89  1.1  christos 	mul	x10,x5,x9		// h1*5*r1
     90  1.1  christos 	umulh	x11,x5,x9
     91  1.1  christos 
     92  1.1  christos 	adds	x12,x12,x10
     93  1.1  christos 	mul	x10,x4,x8		// h0*r1
     94  1.1  christos 	adc	x13,x13,x11
     95  1.1  christos 	umulh	x14,x4,x8
     96  1.1  christos 
     97  1.1  christos 	adds	x13,x13,x10
     98  1.1  christos 	mul	x10,x5,x7		// h1*r0
     99  1.1  christos 	adc	x14,x14,xzr
    100  1.1  christos 	umulh	x11,x5,x7
    101  1.1  christos 
    102  1.1  christos 	adds	x13,x13,x10
    103  1.1  christos 	mul	x10,x6,x9		// h2*5*r1
    104  1.1  christos 	adc	x14,x14,x11
    105  1.1  christos 	mul	x11,x6,x7		// h2*r0
    106  1.1  christos 
    107  1.1  christos 	adds	x13,x13,x10
    108  1.1  christos 	adc	x14,x14,x11
    109  1.1  christos 
                       // x12:x13:x14 now hold the product. Keep the low two bits of x14 as
                       // the new h2 and add (x14 & ~3) + (x14 >> 2), i.e. 5*(x14 >> 2),
                       // back into the low limb.
    110  1.1  christos 	and	x10,x14,#-4		// final reduction
    111  1.1  christos 	and	x6,x14,#3
    112  1.1  christos 	add	x10,x10,x14,lsr#2
    113  1.1  christos 	adds	x4,x12,x10
    114  1.1  christos 	adcs	x5,x13,xzr
    115  1.1  christos 	adc	x6,x6,xzr
    116  1.1  christos 
    117  1.1  christos 	cbnz	x2,.Loop		// more whole blocks remaining?
    118  1.1  christos 
    119  1.1  christos 	stp	x4,x5,[x0]		// store hash value
    120  1.1  christos 	str	x6,[x0,#16]
    121  1.1  christos 
    122  1.1  christos .Lno_data:
    123  1.1  christos 	ret
    124  1.1  christos .size	poly1305_blocks,.-poly1305_blocks
    125  1.1  christos 
    126  1.1  christos .type	poly1305_emit,%function
                       // poly1305_emit: finish the scalar (base 2^64) hash and write the tag.
                       //
                       // In:    x0 = context: h0,h1 at [x0], h2 at [x0,#16]
                       //        x1 = output buffer, 16 bytes are written
                       //        x2 = pointer to the nonce, 16 bytes are read
                       // Out:   [x1] = low 128 bits of (reduced hash + nonce)
                       // Clobb: x4-x6, x10-x14, flags
    127  1.1  christos .align	5
    128  1.1  christos poly1305_emit:
    129  1.4  christos .Lpoly1305_emit:
    130  1.1  christos 	ldp	x4,x5,[x0]		// load hash base 2^64
    131  1.1  christos 	ldr	x6,[x0,#16]
    132  1.1  christos 	ldp	x10,x11,[x2]	// load nonce
    133  1.1  christos 
    134  1.1  christos 	adds	x12,x4,#5		// compare to modulus
    135  1.1  christos 	adcs	x13,x5,xzr
    136  1.1  christos 	adc	x14,x6,xzr
    137  1.1  christos 
    138  1.1  christos 	tst	x14,#-4			// see if it's carried/borrowed
    139  1.1  christos 
                       // Z set: h+5 did not reach bit 2 of the top limb, keep h.
                       // Z clear: use the low 128 bits of h+5 instead.
    140  1.1  christos 	csel	x4,x4,x12,eq
    141  1.1  christos 	csel	x5,x5,x13,eq
    142  1.1  christos 
    143  1.5  christos #ifdef	__AARCH64EB__
    144  1.1  christos 	ror	x10,x10,#32		// flip nonce words
    145  1.1  christos 	ror	x11,x11,#32
    146  1.1  christos #endif
    147  1.1  christos 	adds	x4,x4,x10		// accumulate nonce
    148  1.1  christos 	adc	x5,x5,x11		// carry out of bit 127 is discarded
    149  1.5  christos #ifdef	__AARCH64EB__
    150  1.1  christos 	rev	x4,x4			// flip output bytes
    151  1.1  christos 	rev	x5,x5
    152  1.1  christos #endif
    153  1.1  christos 	stp	x4,x5,[x1]		// write result
    154  1.1  christos 
    155  1.1  christos 	ret
    156  1.1  christos .size	poly1305_emit,.-poly1305_emit
    157  1.1  christos .type	poly1305_mult,%function
                       // poly1305_mult: file-local helper, one multiply-and-reduce step in
                       // base 2^64. It uses a register-only convention and no stack.
                       //
                       // In:    x4,x5,x6 = h0,h1,h2 (multiplicand)
                       //        x7,x8    = r0,r1    (multiplier)
                       //        x9       = s1 = r1 + (r1 >> 2), prepared by the caller
                       // Out:   x4,x5,x6 = partially reduced product (top limb kept to 2 bits
                       //                   plus a possible carry)
                       // Clobb: x10-x14, flags; x0-x3 and x7-x9 are left untouched
    158  1.1  christos .align	5
    159  1.1  christos poly1305_mult:
    160  1.1  christos 	mul	x12,x4,x7		// h0*r0
    161  1.1  christos 	umulh	x13,x4,x7
    162  1.1  christos 
    163  1.1  christos 	mul	x10,x5,x9		// h1*5*r1
    164  1.1  christos 	umulh	x11,x5,x9
    165  1.1  christos 
    166  1.1  christos 	adds	x12,x12,x10
    167  1.1  christos 	mul	x10,x4,x8		// h0*r1
    168  1.1  christos 	adc	x13,x13,x11
    169  1.1  christos 	umulh	x14,x4,x8
    170  1.1  christos 
    171  1.1  christos 	adds	x13,x13,x10
    172  1.1  christos 	mul	x10,x5,x7		// h1*r0
    173  1.1  christos 	adc	x14,x14,xzr
    174  1.1  christos 	umulh	x11,x5,x7
    175  1.1  christos 
    176  1.1  christos 	adds	x13,x13,x10
    177  1.1  christos 	mul	x10,x6,x9		// h2*5*r1
    178  1.1  christos 	adc	x14,x14,x11
    179  1.1  christos 	mul	x11,x6,x7		// h2*r0
    180  1.1  christos 
    181  1.1  christos 	adds	x13,x13,x10
    182  1.1  christos 	adc	x14,x14,x11
    183  1.1  christos 
                       // Keep the low two bits of x14 as the new h2 and add
                       // (x14 & ~3) + (x14 >> 2), i.e. 5*(x14 >> 2), into the low limb.
    184  1.1  christos 	and	x10,x14,#-4		// final reduction
    185  1.1  christos 	and	x6,x14,#3
    186  1.1  christos 	add	x10,x10,x14,lsr#2
    187  1.1  christos 	adds	x4,x12,x10
    188  1.1  christos 	adcs	x5,x13,xzr
    189  1.1  christos 	adc	x6,x6,xzr
    190  1.1  christos 
    191  1.1  christos 	ret
    192  1.1  christos .size	poly1305_mult,.-poly1305_mult
    193  1.1  christos 
    194  1.1  christos .type	poly1305_splat,%function
                       // poly1305_splat: file-local helper that splits the value in x4:x5:x6
                       // into five 26-bit limbs and stores them, with 5x multiples of limbs
                       // 1..4, as 32-bit words spaced 16 bytes apart starting at [x0].
                       //
                       // In:    x4,x5,x6 = value in base 2^64 (low, middle, high)
                       //        x0       = destination of the first word
                       // Out:   words at x0 + 16*k, k = 0..8: r0, r1, s1, r2, s2, r3, s3, r4, s4
                       //        where sN = rN*5
                       // Clobb: x12-x16; flags and x0 are left untouched
    195  1.1  christos .align	5
    196  1.1  christos poly1305_splat:
    197  1.1  christos 	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
    198  1.1  christos 	ubfx	x13,x4,#26,#26		// bits 26..51
    199  1.1  christos 	extr	x14,x5,x4,#52		// bits 52.. straddle x4 and x5
    200  1.1  christos 	and	x14,x14,#0x03ffffff
    201  1.1  christos 	ubfx	x15,x5,#14,#26		// bits 78..103
    202  1.1  christos 	extr	x16,x6,x5,#40		// bits 104 and up; not masked, only w16 is stored
    203  1.1  christos 
                       // Each source register is reused for the 5x multiple as soon as the
                       // plain limb it held has been consumed, hence the interleaving below.
    204  1.1  christos 	str	w12,[x0,#16*0]	// r0
    205  1.1  christos 	add	w12,w13,w13,lsl#2	// r1*5
    206  1.1  christos 	str	w13,[x0,#16*1]	// r1
    207  1.1  christos 	add	w13,w14,w14,lsl#2	// r2*5
    208  1.1  christos 	str	w12,[x0,#16*2]	// s1
    209  1.1  christos 	str	w14,[x0,#16*3]	// r2
    210  1.1  christos 	add	w14,w15,w15,lsl#2	// r3*5
    211  1.1  christos 	str	w13,[x0,#16*4]	// s2
    212  1.1  christos 	str	w15,[x0,#16*5]	// r3
    213  1.1  christos 	add	w15,w16,w16,lsl#2	// r4*5
    214  1.1  christos 	str	w14,[x0,#16*6]	// s3
    215  1.1  christos 	str	w16,[x0,#16*7]	// r4
    216  1.1  christos 	str	w15,[x0,#16*8]	// s4
    217  1.1  christos 
    218  1.1  christos 	ret
    219  1.1  christos .size	poly1305_splat,.-poly1305_splat
    220  1.1  christos 
    221  1.1  christos .type	poly1305_blocks_neon,%function
    222  1.1  christos .align	5
    223  1.1  christos poly1305_blocks_neon:
    224  1.4  christos .Lpoly1305_blocks_neon:
    225  1.1  christos 	ldr	x17,[x0,#24]
    226  1.1  christos 	cmp	x2,#128
    227  1.1  christos 	b.hs	.Lblocks_neon
    228  1.4  christos 	cbz	x17,.Lpoly1305_blocks
    229  1.1  christos 
    230  1.1  christos .Lblocks_neon:
    231  1.2  christos .inst	0xd503233f		// paciasp
    232  1.1  christos 	stp	x29,x30,[sp,#-80]!
    233  1.1  christos 	add	x29,sp,#0
    234  1.1  christos 
    235  1.1  christos 	ands	x2,x2,#-16
    236  1.1  christos 	b.eq	.Lno_data_neon
    237  1.1  christos 
    238  1.1  christos 	cbz	x17,.Lbase2_64_neon
    239  1.1  christos 
    240  1.1  christos 	ldp	w10,w11,[x0]		// load hash value base 2^26
    241  1.1  christos 	ldp	w12,w13,[x0,#8]
    242  1.1  christos 	ldr	w14,[x0,#16]
    243  1.1  christos 
    244  1.1  christos 	tst	x2,#31
    245  1.1  christos 	b.eq	.Leven_neon
    246  1.1  christos 
    247  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
    248  1.1  christos 
    249  1.1  christos 	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
    250  1.1  christos 	lsr	x5,x12,#12
    251  1.1  christos 	adds	x4,x4,x12,lsl#52
    252  1.1  christos 	add	x5,x5,x13,lsl#14
    253  1.1  christos 	adc	x5,x5,xzr
    254  1.1  christos 	lsr	x6,x14,#24
    255  1.1  christos 	adds	x5,x5,x14,lsl#40
    256  1.1  christos 	adc	x14,x6,xzr		// can be partially reduced...
    257  1.1  christos 
    258  1.1  christos 	ldp	x12,x13,[x1],#16	// load input
    259  1.1  christos 	sub	x2,x2,#16
    260  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    261  1.1  christos 
    262  1.1  christos 	and	x10,x14,#-4		// ... so reduce
    263  1.1  christos 	and	x6,x14,#3
    264  1.1  christos 	add	x10,x10,x14,lsr#2
    265  1.1  christos 	adds	x4,x4,x10
    266  1.1  christos 	adcs	x5,x5,xzr
    267  1.1  christos 	adc	x6,x6,xzr
    268  1.1  christos 
    269  1.5  christos #ifdef	__AARCH64EB__
    270  1.1  christos 	rev	x12,x12
    271  1.1  christos 	rev	x13,x13
    272  1.1  christos #endif
    273  1.1  christos 	adds	x4,x4,x12		// accumulate input
    274  1.1  christos 	adcs	x5,x5,x13
    275  1.1  christos 	adc	x6,x6,x3
    276  1.1  christos 
    277  1.1  christos 	bl	poly1305_mult
    278  1.1  christos 	ldr	x30,[sp,#8]
    279  1.1  christos 
    280  1.1  christos 	cbz	x3,.Lstore_base2_64_neon
    281  1.1  christos 
    282  1.1  christos 	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
    283  1.1  christos 	ubfx	x11,x4,#26,#26
    284  1.1  christos 	extr	x12,x5,x4,#52
    285  1.1  christos 	and	x12,x12,#0x03ffffff
    286  1.1  christos 	ubfx	x13,x5,#14,#26
    287  1.1  christos 	extr	x14,x6,x5,#40
    288  1.1  christos 
    289  1.1  christos 	cbnz	x2,.Leven_neon
    290  1.1  christos 
    291  1.1  christos 	stp	w10,w11,[x0]		// store hash value base 2^26
    292  1.1  christos 	stp	w12,w13,[x0,#8]
    293  1.1  christos 	str	w14,[x0,#16]
    294  1.1  christos 	b	.Lno_data_neon
    295  1.1  christos 
    296  1.1  christos .align	4
    297  1.1  christos .Lstore_base2_64_neon:
    298  1.1  christos 	stp	x4,x5,[x0]		// store hash value base 2^64
    299  1.1  christos 	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
    300  1.1  christos 	b	.Lno_data_neon
    301  1.1  christos 
    302  1.1  christos .align	4
    303  1.1  christos .Lbase2_64_neon:
    304  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
    305  1.1  christos 
    306  1.1  christos 	ldp	x4,x5,[x0]		// load hash value base 2^64
    307  1.1  christos 	ldr	x6,[x0,#16]
    308  1.1  christos 
    309  1.1  christos 	tst	x2,#31
    310  1.1  christos 	b.eq	.Linit_neon
    311  1.1  christos 
    312  1.1  christos 	ldp	x12,x13,[x1],#16	// load input
    313  1.1  christos 	sub	x2,x2,#16
    314  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    315  1.5  christos #ifdef	__AARCH64EB__
    316  1.1  christos 	rev	x12,x12
    317  1.1  christos 	rev	x13,x13
    318  1.1  christos #endif
    319  1.1  christos 	adds	x4,x4,x12		// accumulate input
    320  1.1  christos 	adcs	x5,x5,x13
    321  1.1  christos 	adc	x6,x6,x3
    322  1.1  christos 
    323  1.1  christos 	bl	poly1305_mult
    324  1.1  christos 
    325  1.1  christos .Linit_neon:
    326  1.1  christos 	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
    327  1.1  christos 	ubfx	x11,x4,#26,#26
    328  1.1  christos 	extr	x12,x5,x4,#52
    329  1.1  christos 	and	x12,x12,#0x03ffffff
    330  1.1  christos 	ubfx	x13,x5,#14,#26
    331  1.1  christos 	extr	x14,x6,x5,#40
    332  1.1  christos 
    333  1.1  christos 	stp	d8,d9,[sp,#16]		// meet ABI requirements
    334  1.1  christos 	stp	d10,d11,[sp,#32]
    335  1.1  christos 	stp	d12,d13,[sp,#48]
    336  1.1  christos 	stp	d14,d15,[sp,#64]
    337  1.1  christos 
    338  1.1  christos 	fmov	d24,x10
    339  1.1  christos 	fmov	d25,x11
    340  1.1  christos 	fmov	d26,x12
    341  1.1  christos 	fmov	d27,x13
    342  1.1  christos 	fmov	d28,x14
    343  1.1  christos 
    344  1.1  christos 	////////////////////////////////// initialize r^n table
    345  1.1  christos 	mov	x4,x7			// r^1
    346  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    347  1.1  christos 	mov	x5,x8
    348  1.1  christos 	mov	x6,xzr
    349  1.1  christos 	add	x0,x0,#48+12
    350  1.1  christos 	bl	poly1305_splat
    351  1.1  christos 
    352  1.1  christos 	bl	poly1305_mult		// r^2
    353  1.1  christos 	sub	x0,x0,#4
    354  1.1  christos 	bl	poly1305_splat
    355  1.1  christos 
    356  1.1  christos 	bl	poly1305_mult		// r^3
    357  1.1  christos 	sub	x0,x0,#4
    358  1.1  christos 	bl	poly1305_splat
    359  1.1  christos 
    360  1.1  christos 	bl	poly1305_mult		// r^4
    361  1.1  christos 	sub	x0,x0,#4
    362  1.1  christos 	bl	poly1305_splat
    363  1.1  christos 	ldr	x30,[sp,#8]
    364  1.1  christos 
    365  1.1  christos 	add	x16,x1,#32
    366  1.1  christos 	adr	x17,.Lzeros
    367  1.1  christos 	subs	x2,x2,#64
    368  1.1  christos 	csel	x16,x17,x16,lo
    369  1.1  christos 
    370  1.1  christos 	mov	x4,#1
    371  1.4  christos 	stur	x4,[x0,#-24]		// set is_base2_26
    372  1.1  christos 	sub	x0,x0,#48		// restore original x0
    373  1.1  christos 	b	.Ldo_neon
    374  1.1  christos 
    375  1.1  christos .align	4
    376  1.1  christos .Leven_neon:
    377  1.1  christos 	add	x16,x1,#32
    378  1.1  christos 	adr	x17,.Lzeros
    379  1.1  christos 	subs	x2,x2,#64
    380  1.1  christos 	csel	x16,x17,x16,lo
    381  1.1  christos 
    382  1.1  christos 	stp	d8,d9,[sp,#16]		// meet ABI requirements
    383  1.1  christos 	stp	d10,d11,[sp,#32]
    384  1.1  christos 	stp	d12,d13,[sp,#48]
    385  1.1  christos 	stp	d14,d15,[sp,#64]
    386  1.1  christos 
    387  1.1  christos 	fmov	d24,x10
    388  1.1  christos 	fmov	d25,x11
    389  1.1  christos 	fmov	d26,x12
    390  1.1  christos 	fmov	d27,x13
    391  1.1  christos 	fmov	d28,x14
    392  1.1  christos 
    393  1.1  christos .Ldo_neon:
    394  1.1  christos 	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
    395  1.1  christos 	ldp	x9,x13,[x16],#48
    396  1.1  christos 
    397  1.1  christos 	lsl	x3,x3,#24
    398  1.1  christos 	add	x15,x0,#48
    399  1.1  christos 
    400  1.5  christos #ifdef	__AARCH64EB__
    401  1.1  christos 	rev	x8,x8
    402  1.1  christos 	rev	x12,x12
    403  1.1  christos 	rev	x9,x9
    404  1.1  christos 	rev	x13,x13
    405  1.1  christos #endif
    406  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    407  1.1  christos 	and	x5,x9,#0x03ffffff
    408  1.1  christos 	ubfx	x6,x8,#26,#26
    409  1.1  christos 	ubfx	x7,x9,#26,#26
    410  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    411  1.1  christos 	extr	x8,x12,x8,#52
    412  1.1  christos 	extr	x9,x13,x9,#52
    413  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    414  1.1  christos 	fmov	d14,x4
    415  1.1  christos 	and	x8,x8,#0x03ffffff
    416  1.1  christos 	and	x9,x9,#0x03ffffff
    417  1.1  christos 	ubfx	x10,x12,#14,#26
    418  1.1  christos 	ubfx	x11,x13,#14,#26
    419  1.1  christos 	add	x12,x3,x12,lsr#40
    420  1.1  christos 	add	x13,x3,x13,lsr#40
    421  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    422  1.1  christos 	fmov	d15,x6
    423  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    424  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    425  1.1  christos 	fmov	d16,x8
    426  1.1  christos 	fmov	d17,x10
    427  1.1  christos 	fmov	d18,x12
    428  1.1  christos 
    429  1.1  christos 	ldp	x8,x12,[x1],#16	// inp[0:1]
    430  1.1  christos 	ldp	x9,x13,[x1],#48
    431  1.1  christos 
    432  1.1  christos 	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
    433  1.1  christos 	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
    434  1.1  christos 	ld1	{v8.4s},[x15]
    435  1.1  christos 
    436  1.5  christos #ifdef	__AARCH64EB__
    437  1.1  christos 	rev	x8,x8
    438  1.1  christos 	rev	x12,x12
    439  1.1  christos 	rev	x9,x9
    440  1.1  christos 	rev	x13,x13
    441  1.1  christos #endif
    442  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    443  1.1  christos 	and	x5,x9,#0x03ffffff
    444  1.1  christos 	ubfx	x6,x8,#26,#26
    445  1.1  christos 	ubfx	x7,x9,#26,#26
    446  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    447  1.1  christos 	extr	x8,x12,x8,#52
    448  1.1  christos 	extr	x9,x13,x9,#52
    449  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    450  1.1  christos 	fmov	d9,x4
    451  1.1  christos 	and	x8,x8,#0x03ffffff
    452  1.1  christos 	and	x9,x9,#0x03ffffff
    453  1.1  christos 	ubfx	x10,x12,#14,#26
    454  1.1  christos 	ubfx	x11,x13,#14,#26
    455  1.1  christos 	add	x12,x3,x12,lsr#40
    456  1.1  christos 	add	x13,x3,x13,lsr#40
    457  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    458  1.1  christos 	fmov	d10,x6
    459  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    460  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    461  1.1  christos 	movi	v31.2d,#-1
    462  1.1  christos 	fmov	d11,x8
    463  1.1  christos 	fmov	d12,x10
    464  1.1  christos 	fmov	d13,x12
    465  1.1  christos 	ushr	v31.2d,v31.2d,#38
    466  1.1  christos 
    467  1.1  christos 	b.ls	.Lskip_loop
    468  1.1  christos 
    469  1.1  christos .align	4
    470  1.1  christos .Loop_neon:
    471  1.1  christos 	////////////////////////////////////////////////////////////////
    472  1.1  christos 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
    473  1.1  christos 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
    474  1.1  christos 	//   \___________________/
    475  1.1  christos 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
    476  1.1  christos 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
    477  1.1  christos 	//   \___________________/ \____________________/
    478  1.1  christos 	//
    479  1.1  christos 	// Note that we start with inp[2:3]*r^2. This is because it
    480  1.1  christos 	// doesn't depend on reduction in previous iteration.
    481  1.1  christos 	////////////////////////////////////////////////////////////////
    482  1.1  christos 	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
    483  1.1  christos 	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
    484  1.1  christos 	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
    485  1.1  christos 	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
    486  1.1  christos 	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
    487  1.1  christos 
    488  1.1  christos 	subs	x2,x2,#64
    489  1.1  christos 	umull	v23.2d,v14.2s,v7.s[2]
    490  1.1  christos 	csel	x16,x17,x16,lo
    491  1.1  christos 	umull	v22.2d,v14.2s,v5.s[2]
    492  1.1  christos 	umull	v21.2d,v14.2s,v3.s[2]
    493  1.1  christos 	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
    494  1.1  christos 	umull	v20.2d,v14.2s,v1.s[2]
    495  1.1  christos 	ldp	x9,x13,[x16],#48
    496  1.1  christos 	umull	v19.2d,v14.2s,v0.s[2]
    497  1.5  christos #ifdef	__AARCH64EB__
    498  1.1  christos 	rev	x8,x8
    499  1.1  christos 	rev	x12,x12
    500  1.1  christos 	rev	x9,x9
    501  1.1  christos 	rev	x13,x13
    502  1.1  christos #endif
    503  1.1  christos 
    504  1.1  christos 	umlal	v23.2d,v15.2s,v5.s[2]
    505  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    506  1.1  christos 	umlal	v22.2d,v15.2s,v3.s[2]
    507  1.1  christos 	and	x5,x9,#0x03ffffff
    508  1.1  christos 	umlal	v21.2d,v15.2s,v1.s[2]
    509  1.1  christos 	ubfx	x6,x8,#26,#26
    510  1.1  christos 	umlal	v20.2d,v15.2s,v0.s[2]
    511  1.1  christos 	ubfx	x7,x9,#26,#26
    512  1.1  christos 	umlal	v19.2d,v15.2s,v8.s[2]
    513  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    514  1.1  christos 
    515  1.1  christos 	umlal	v23.2d,v16.2s,v3.s[2]
    516  1.1  christos 	extr	x8,x12,x8,#52
    517  1.1  christos 	umlal	v22.2d,v16.2s,v1.s[2]
    518  1.1  christos 	extr	x9,x13,x9,#52
    519  1.1  christos 	umlal	v21.2d,v16.2s,v0.s[2]
    520  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    521  1.1  christos 	umlal	v20.2d,v16.2s,v8.s[2]
    522  1.1  christos 	fmov	d14,x4
    523  1.1  christos 	umlal	v19.2d,v16.2s,v6.s[2]
    524  1.1  christos 	and	x8,x8,#0x03ffffff
    525  1.1  christos 
    526  1.1  christos 	umlal	v23.2d,v17.2s,v1.s[2]
    527  1.1  christos 	and	x9,x9,#0x03ffffff
    528  1.1  christos 	umlal	v22.2d,v17.2s,v0.s[2]
    529  1.1  christos 	ubfx	x10,x12,#14,#26
    530  1.1  christos 	umlal	v21.2d,v17.2s,v8.s[2]
    531  1.1  christos 	ubfx	x11,x13,#14,#26
    532  1.1  christos 	umlal	v20.2d,v17.2s,v6.s[2]
    533  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    534  1.1  christos 	umlal	v19.2d,v17.2s,v4.s[2]
    535  1.1  christos 	fmov	d15,x6
    536  1.1  christos 
    537  1.1  christos 	add	v11.2s,v11.2s,v26.2s
    538  1.1  christos 	add	x12,x3,x12,lsr#40
    539  1.1  christos 	umlal	v23.2d,v18.2s,v0.s[2]
    540  1.1  christos 	add	x13,x3,x13,lsr#40
    541  1.1  christos 	umlal	v22.2d,v18.2s,v8.s[2]
    542  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    543  1.1  christos 	umlal	v21.2d,v18.2s,v6.s[2]
    544  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    545  1.1  christos 	umlal	v20.2d,v18.2s,v4.s[2]
    546  1.1  christos 	fmov	d16,x8
    547  1.1  christos 	umlal	v19.2d,v18.2s,v2.s[2]
    548  1.1  christos 	fmov	d17,x10
    549  1.1  christos 
    550  1.1  christos 	////////////////////////////////////////////////////////////////
    551  1.1  christos 	// (hash+inp[0:1])*r^4 and accumulate
    552  1.1  christos 
    553  1.1  christos 	add	v9.2s,v9.2s,v24.2s
    554  1.1  christos 	fmov	d18,x12
    555  1.1  christos 	umlal	v22.2d,v11.2s,v1.s[0]
    556  1.1  christos 	ldp	x8,x12,[x1],#16	// inp[0:1]
    557  1.1  christos 	umlal	v19.2d,v11.2s,v6.s[0]
    558  1.1  christos 	ldp	x9,x13,[x1],#48
    559  1.1  christos 	umlal	v23.2d,v11.2s,v3.s[0]
    560  1.1  christos 	umlal	v20.2d,v11.2s,v8.s[0]
    561  1.1  christos 	umlal	v21.2d,v11.2s,v0.s[0]
    562  1.5  christos #ifdef	__AARCH64EB__
    563  1.1  christos 	rev	x8,x8
    564  1.1  christos 	rev	x12,x12
    565  1.1  christos 	rev	x9,x9
    566  1.1  christos 	rev	x13,x13
    567  1.1  christos #endif
    568  1.1  christos 
    569  1.1  christos 	add	v10.2s,v10.2s,v25.2s
    570  1.1  christos 	umlal	v22.2d,v9.2s,v5.s[0]
    571  1.1  christos 	umlal	v23.2d,v9.2s,v7.s[0]
    572  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    573  1.1  christos 	umlal	v21.2d,v9.2s,v3.s[0]
    574  1.1  christos 	and	x5,x9,#0x03ffffff
    575  1.1  christos 	umlal	v19.2d,v9.2s,v0.s[0]
    576  1.1  christos 	ubfx	x6,x8,#26,#26
    577  1.1  christos 	umlal	v20.2d,v9.2s,v1.s[0]
    578  1.1  christos 	ubfx	x7,x9,#26,#26
    579  1.1  christos 
    580  1.1  christos 	add	v12.2s,v12.2s,v27.2s
    581  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    582  1.1  christos 	umlal	v22.2d,v10.2s,v3.s[0]
    583  1.1  christos 	extr	x8,x12,x8,#52
    584  1.1  christos 	umlal	v23.2d,v10.2s,v5.s[0]
    585  1.1  christos 	extr	x9,x13,x9,#52
    586  1.1  christos 	umlal	v19.2d,v10.2s,v8.s[0]
    587  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    588  1.1  christos 	umlal	v21.2d,v10.2s,v1.s[0]
    589  1.1  christos 	fmov	d9,x4
    590  1.1  christos 	umlal	v20.2d,v10.2s,v0.s[0]
    591  1.1  christos 	and	x8,x8,#0x03ffffff
    592  1.1  christos 
    593  1.1  christos 	add	v13.2s,v13.2s,v28.2s
    594  1.1  christos 	and	x9,x9,#0x03ffffff
    595  1.1  christos 	umlal	v22.2d,v12.2s,v0.s[0]
    596  1.1  christos 	ubfx	x10,x12,#14,#26
    597  1.1  christos 	umlal	v19.2d,v12.2s,v4.s[0]
    598  1.1  christos 	ubfx	x11,x13,#14,#26
    599  1.1  christos 	umlal	v23.2d,v12.2s,v1.s[0]
    600  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    601  1.1  christos 	umlal	v20.2d,v12.2s,v6.s[0]
    602  1.1  christos 	fmov	d10,x6
    603  1.1  christos 	umlal	v21.2d,v12.2s,v8.s[0]
    604  1.1  christos 	add	x12,x3,x12,lsr#40
    605  1.1  christos 
    606  1.1  christos 	umlal	v22.2d,v13.2s,v8.s[0]
    607  1.1  christos 	add	x13,x3,x13,lsr#40
    608  1.1  christos 	umlal	v19.2d,v13.2s,v2.s[0]
    609  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    610  1.1  christos 	umlal	v23.2d,v13.2s,v0.s[0]
    611  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    612  1.1  christos 	umlal	v20.2d,v13.2s,v4.s[0]
    613  1.1  christos 	fmov	d11,x8
    614  1.1  christos 	umlal	v21.2d,v13.2s,v6.s[0]
    615  1.1  christos 	fmov	d12,x10
    616  1.1  christos 	fmov	d13,x12
    617  1.1  christos 
    618  1.1  christos 	/////////////////////////////////////////////////////////////////
    619  1.1  christos 	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
    620  1.1  christos 	// and P. Schwabe
    621  1.1  christos 	//
    622  1.1  christos 	// [see discussion in poly1305-armv4 module]
    623  1.1  christos 
    624  1.1  christos 	ushr	v29.2d,v22.2d,#26
    625  1.1  christos 	xtn	v27.2s,v22.2d
    626  1.1  christos 	ushr	v30.2d,v19.2d,#26
    627  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    628  1.1  christos 	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
    629  1.1  christos 	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
    630  1.1  christos 	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
    631  1.1  christos 
    632  1.1  christos 	ushr	v29.2d,v23.2d,#26
    633  1.1  christos 	xtn	v28.2s,v23.2d
    634  1.1  christos 	ushr	v30.2d,v20.2d,#26
    635  1.1  christos 	xtn	v25.2s,v20.2d
    636  1.1  christos 	bic	v28.2s,#0xfc,lsl#24
    637  1.1  christos 	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
    638  1.1  christos 
    639  1.1  christos 	add	v19.2d,v19.2d,v29.2d
    640  1.1  christos 	shl	v29.2d,v29.2d,#2
    641  1.1  christos 	shrn	v30.2s,v21.2d,#26
    642  1.1  christos 	xtn	v26.2s,v21.2d
    643  1.1  christos 	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
    644  1.1  christos 	bic	v25.2s,#0xfc,lsl#24
    645  1.1  christos 	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
    646  1.1  christos 	bic	v26.2s,#0xfc,lsl#24
    647  1.1  christos 
    648  1.1  christos 	shrn	v29.2s,v19.2d,#26
    649  1.1  christos 	xtn	v24.2s,v19.2d
    650  1.1  christos 	ushr	v30.2s,v27.2s,#26
    651  1.1  christos 	bic	v27.2s,#0xfc,lsl#24
    652  1.1  christos 	bic	v24.2s,#0xfc,lsl#24
    653  1.1  christos 	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
    654  1.1  christos 	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
    655  1.1  christos 
    656  1.1  christos 	b.hi	.Loop_neon
    657  1.1  christos 
    658  1.1  christos .Lskip_loop:
    659  1.1  christos 	dup	v16.2d,v16.d[0]
    660  1.1  christos 	add	v11.2s,v11.2s,v26.2s
    661  1.1  christos 
    662  1.1  christos 	////////////////////////////////////////////////////////////////
    663  1.1  christos 	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
    664  1.1  christos 
    665  1.1  christos 	adds	x2,x2,#32
    666  1.1  christos 	b.ne	.Long_tail
    667  1.1  christos 
    668  1.1  christos 	dup	v16.2d,v11.d[0]
    669  1.1  christos 	add	v14.2s,v9.2s,v24.2s
    670  1.1  christos 	add	v17.2s,v12.2s,v27.2s
    671  1.1  christos 	add	v15.2s,v10.2s,v25.2s
    672  1.1  christos 	add	v18.2s,v13.2s,v28.2s
    673  1.1  christos 
    674  1.1  christos .Long_tail:
    675  1.1  christos 	dup	v14.2d,v14.d[0]
    676  1.1  christos 	umull2	v19.2d,v16.4s,v6.4s
    677  1.1  christos 	umull2	v22.2d,v16.4s,v1.4s
    678  1.1  christos 	umull2	v23.2d,v16.4s,v3.4s
    679  1.1  christos 	umull2	v21.2d,v16.4s,v0.4s
    680  1.1  christos 	umull2	v20.2d,v16.4s,v8.4s
    681  1.1  christos 
    682  1.1  christos 	dup	v15.2d,v15.d[0]
    683  1.1  christos 	umlal2	v19.2d,v14.4s,v0.4s
    684  1.1  christos 	umlal2	v21.2d,v14.4s,v3.4s
    685  1.1  christos 	umlal2	v22.2d,v14.4s,v5.4s
    686  1.1  christos 	umlal2	v23.2d,v14.4s,v7.4s
    687  1.1  christos 	umlal2	v20.2d,v14.4s,v1.4s
    688  1.1  christos 
    689  1.1  christos 	dup	v17.2d,v17.d[0]
    690  1.1  christos 	umlal2	v19.2d,v15.4s,v8.4s
    691  1.1  christos 	umlal2	v22.2d,v15.4s,v3.4s
    692  1.1  christos 	umlal2	v21.2d,v15.4s,v1.4s
    693  1.1  christos 	umlal2	v23.2d,v15.4s,v5.4s
    694  1.1  christos 	umlal2	v20.2d,v15.4s,v0.4s
    695  1.1  christos 
    696  1.1  christos 	dup	v18.2d,v18.d[0]
    697  1.1  christos 	umlal2	v22.2d,v17.4s,v0.4s
    698  1.1  christos 	umlal2	v23.2d,v17.4s,v1.4s
    699  1.1  christos 	umlal2	v19.2d,v17.4s,v4.4s
    700  1.1  christos 	umlal2	v20.2d,v17.4s,v6.4s
    701  1.1  christos 	umlal2	v21.2d,v17.4s,v8.4s
    702  1.1  christos 
    703  1.1  christos 	umlal2	v22.2d,v18.4s,v8.4s
    704  1.1  christos 	umlal2	v19.2d,v18.4s,v2.4s
    705  1.1  christos 	umlal2	v23.2d,v18.4s,v0.4s
    706  1.1  christos 	umlal2	v20.2d,v18.4s,v4.4s
    707  1.1  christos 	umlal2	v21.2d,v18.4s,v6.4s
    708  1.1  christos 
    709  1.1  christos 	b.eq	.Lshort_tail
    710  1.1  christos 
    711  1.1  christos 	////////////////////////////////////////////////////////////////
    712  1.1  christos 	// (hash+inp[0:1])*r^4:r^3 and accumulate
    713  1.1  christos 
    714  1.1  christos 	add	v9.2s,v9.2s,v24.2s
    715  1.1  christos 	umlal	v22.2d,v11.2s,v1.2s
    716  1.1  christos 	umlal	v19.2d,v11.2s,v6.2s
    717  1.1  christos 	umlal	v23.2d,v11.2s,v3.2s
    718  1.1  christos 	umlal	v20.2d,v11.2s,v8.2s
    719  1.1  christos 	umlal	v21.2d,v11.2s,v0.2s
    720  1.1  christos 
    721  1.1  christos 	add	v10.2s,v10.2s,v25.2s
    722  1.1  christos 	umlal	v22.2d,v9.2s,v5.2s
    723  1.1  christos 	umlal	v19.2d,v9.2s,v0.2s
    724  1.1  christos 	umlal	v23.2d,v9.2s,v7.2s
    725  1.1  christos 	umlal	v20.2d,v9.2s,v1.2s
    726  1.1  christos 	umlal	v21.2d,v9.2s,v3.2s
    727  1.1  christos 
    728  1.1  christos 	add	v12.2s,v12.2s,v27.2s
    729  1.1  christos 	umlal	v22.2d,v10.2s,v3.2s
    730  1.1  christos 	umlal	v19.2d,v10.2s,v8.2s
    731  1.1  christos 	umlal	v23.2d,v10.2s,v5.2s
    732  1.1  christos 	umlal	v20.2d,v10.2s,v0.2s
    733  1.1  christos 	umlal	v21.2d,v10.2s,v1.2s
    734  1.1  christos 
    735  1.1  christos 	add	v13.2s,v13.2s,v28.2s
    736  1.1  christos 	umlal	v22.2d,v12.2s,v0.2s
    737  1.1  christos 	umlal	v19.2d,v12.2s,v4.2s
    738  1.1  christos 	umlal	v23.2d,v12.2s,v1.2s
    739  1.1  christos 	umlal	v20.2d,v12.2s,v6.2s
    740  1.1  christos 	umlal	v21.2d,v12.2s,v8.2s
    741  1.1  christos 
    742  1.1  christos 	umlal	v22.2d,v13.2s,v8.2s
    743  1.1  christos 	umlal	v19.2d,v13.2s,v2.2s
    744  1.1  christos 	umlal	v23.2d,v13.2s,v0.2s
    745  1.1  christos 	umlal	v20.2d,v13.2s,v4.2s
    746  1.1  christos 	umlal	v21.2d,v13.2s,v6.2s
    747  1.1  christos 
    748  1.1  christos .Lshort_tail:
    749  1.1  christos 	////////////////////////////////////////////////////////////////
    750  1.1  christos 	// horizontal add
    751  1.1  christos 
    752  1.1  christos 	addp	v22.2d,v22.2d,v22.2d
    753  1.1  christos 	ldp	d8,d9,[sp,#16]		// meet ABI requirements
    754  1.1  christos 	addp	v19.2d,v19.2d,v19.2d
    755  1.1  christos 	ldp	d10,d11,[sp,#32]
    756  1.1  christos 	addp	v23.2d,v23.2d,v23.2d
    757  1.1  christos 	ldp	d12,d13,[sp,#48]
    758  1.1  christos 	addp	v20.2d,v20.2d,v20.2d
    759  1.1  christos 	ldp	d14,d15,[sp,#64]
    760  1.1  christos 	addp	v21.2d,v21.2d,v21.2d
    761  1.1  christos 
    762  1.1  christos 	////////////////////////////////////////////////////////////////
    763  1.1  christos 	// lazy reduction, but without narrowing
    764  1.1  christos 
    765  1.1  christos 	ushr	v29.2d,v22.2d,#26
    766  1.1  christos 	and	v22.16b,v22.16b,v31.16b
    767  1.1  christos 	ushr	v30.2d,v19.2d,#26
    768  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    769  1.1  christos 
    770  1.1  christos 	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
    771  1.1  christos 	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
    772  1.1  christos 
    773  1.1  christos 	ushr	v29.2d,v23.2d,#26
    774  1.1  christos 	and	v23.16b,v23.16b,v31.16b
    775  1.1  christos 	ushr	v30.2d,v20.2d,#26
    776  1.1  christos 	and	v20.16b,v20.16b,v31.16b
    777  1.1  christos 	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
    778  1.1  christos 
    779  1.1  christos 	add	v19.2d,v19.2d,v29.2d
    780  1.1  christos 	shl	v29.2d,v29.2d,#2
    781  1.1  christos 	ushr	v30.2d,v21.2d,#26
    782  1.1  christos 	and	v21.16b,v21.16b,v31.16b
    783  1.1  christos 	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
    784  1.1  christos 	add	v22.2d,v22.2d,v30.2d	// h2 -> h3
    785  1.1  christos 
    786  1.1  christos 	ushr	v29.2d,v19.2d,#26
    787  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    788  1.1  christos 	ushr	v30.2d,v22.2d,#26
    789  1.1  christos 	and	v22.16b,v22.16b,v31.16b
    790  1.1  christos 	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
    791  1.1  christos 	add	v23.2d,v23.2d,v30.2d	// h3 -> h4
    792  1.1  christos 
    793  1.1  christos 	////////////////////////////////////////////////////////////////
    794  1.1  christos 	// write the result, can be partially reduced
    795  1.1  christos 
    796  1.1  christos 	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
    797  1.1  christos 	st1	{v23.s}[0],[x0]
    798  1.1  christos 
    799  1.1  christos .Lno_data_neon:
    800  1.3  christos 	ldr	x29,[sp],#80
    801  1.2  christos .inst	0xd50323bf		// autiasp
    802  1.1  christos 	ret
    803  1.1  christos .size	poly1305_blocks_neon,.-poly1305_blocks_neon
    804  1.1  christos 
    805  1.1  christos .type	poly1305_emit_neon,%function
    806  1.1  christos .align	5
    807  1.1  christos poly1305_emit_neon:	// finalize: x0 = state (read), x1 = 16-byte result (written), x2 = 16-byte nonce (read)
    808  1.4  christos .Lpoly1305_emit_neon:	// local alias; poly1305_init takes this address with adr
    809  1.1  christos 	ldr	x17,[x0,#24]	// 64-bit flag at state+24 (presumably is_base2_26, zeroed by poly1305_init)
    810  1.1  christos 	cbz	x17,poly1305_emit	// flag is zero: branch (no link) to poly1305_emit, which returns to our caller
    811  1.1  christos 
    812  1.1  christos 	ldp	w10,w11,[x0]		// load hash value base 2^26
    813  1.1  christos 	ldp	w12,w13,[x0,#8]	// limbs 2 and 3 (32-bit loads zero-extend into x12/x13)
    814  1.1  christos 	ldr	w14,[x0,#16]	// limb 4
    815  1.1  christos 
    816  1.1  christos 	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
    817  1.1  christos 	lsr	x5,x12,#12	// limb 2 sits at bit 52: its bits above the low 12 spill into x5
    818  1.1  christos 	adds	x4,x4,x12,lsl#52	// low 12 bits of limb 2 go to x4[63:52]; carry recorded
    819  1.1  christos 	add	x5,x5,x13,lsl#14	// limb 3 sits at bit 78 = 64+14 (flags untouched)
    820  1.1  christos 	adc	x5,x5,xzr	// consume the carry from the adds on x4 above
    821  1.1  christos 	lsr	x6,x14,#24	// limb 4 sits at bit 104 = 64+40: its bits above the low 24 spill into x6
    822  1.1  christos 	adds	x5,x5,x14,lsl#40	// low 24 bits of limb 4 go to x5[63:40]; carry recorded
    823  1.1  christos 	adc	x6,x6,xzr		// can be partially reduced...
    824  1.1  christos 
    825  1.1  christos 	ldp	x10,x11,[x2]	// load nonce
    826  1.1  christos 
    827  1.1  christos 	and	x12,x6,#-4		// ... so reduce
    828  1.1  christos 	add	x12,x12,x6,lsr#2	// x12 = 4*(x6>>2) + (x6>>2) = 5*(x6>>2): bits at and above 2^130 fold back as *5
    829  1.1  christos 	and	x6,x6,#3	// keep only bits 128-129 in the top word
    830  1.1  christos 	adds	x4,x4,x12	// add the folded value at bit 0 ...
    831  1.1  christos 	adcs	x5,x5,xzr	// ... and ripple the carry through x5
    832  1.1  christos 	adc	x6,x6,xzr	// ... into x6
    833  1.1  christos 
    834  1.1  christos 	adds	x12,x4,#5		// compare to modulus
    835  1.1  christos 	adcs	x13,x5,xzr	// x14:x13:x12 = (x6:x5:x4) + 5
    836  1.1  christos 	adc	x14,x6,xzr
    837  1.1  christos 
    838  1.1  christos 	tst	x14,#-4			// see if it's carried/borrowed
    839  1.1  christos 
    840  1.1  christos 	csel	x4,x4,x12,eq	// eq (no bit at or above 2^130 in h+5): keep h; otherwise take the low 128 bits of h+5
    841  1.1  christos 	csel	x5,x5,x13,eq
    842  1.1  christos 
    843  1.5  christos #ifdef	__AARCH64EB__
    844  1.1  christos 	ror	x10,x10,#32		// flip nonce words
    845  1.1  christos 	ror	x11,x11,#32	// same 32-bit half swap for the second nonce doubleword
    846  1.1  christos #endif
    847  1.1  christos 	adds	x4,x4,x10		// accumulate nonce
    848  1.1  christos 	adc	x5,x5,x11	// carry out of bit 127 is dropped (128-bit wraparound add)
    849  1.5  christos #ifdef	__AARCH64EB__
    850  1.1  christos 	rev	x4,x4			// flip output bytes
    851  1.1  christos 	rev	x5,x5
    852  1.1  christos #endif
    853  1.1  christos 	stp	x4,x5,[x1]		// write result
    854  1.1  christos 
    855  1.1  christos 	ret
    856  1.1  christos .size	poly1305_emit_neon,.-poly1305_emit_neon
    857  1.1  christos 
    858  1.1  christos .align	5	// 2^5 = 32-byte alignment for the table below
    859  1.1  christos .Lzeros:	// file-local label for a 32-byte block of zeros
    860  1.1  christos .long	0,0,0,0,0,0,0,0	// eight zero 32-bit words
    861  1.1  christos .byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// ASCII "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>" plus NUL
    862  1.1  christos .align	2	// pad back to 4-byte alignment after the odd-length string
    863  1.1  christos .align	2	// repeated directive; already aligned, so it has no further effect
    864