Home | History | Annotate | Line # | Download | only in aarch64
      1  1.1  christos #include "arm_arch.h"
      2  1.1  christos 
      3  1.1  christos .text
      4  1.1  christos 
      5  1.1  christos // forward "declarations" are required for Apple
      6  1.1  christos 
      7  1.1  christos .hidden	OPENSSL_armcap_P
      8  1.1  christos .globl	poly1305_init
      9  1.1  christos .hidden	poly1305_init
     10  1.1  christos .globl	poly1305_blocks
     11  1.1  christos .hidden	poly1305_blocks
     12  1.1  christos .globl	poly1305_emit
     13  1.1  christos .hidden	poly1305_emit
     14  1.1  christos 
     15  1.1  christos .type	poly1305_init,%function
     16  1.1  christos .align	5
                       // poly1305_init: reset the hash state and install the clamped key.
                       //   In:  x0 = context; bytes 0..31 are zeroed (hash value plus the
                       //             is_base2_26 word), the clamped key is stored at [x0,#32]
                       //        x1 = key pointer (16 bytes are read); may be zero
                       //        x2 = destination for the two selected entry points
                       //             (blocks routine first, emit routine second)
                       //   Out: x0 = 1 when a key was installed, 0 when x1 was zero
                       //             (the state is still zeroed, nothing is written via x2)
                       //   Clobbers: x7-x13, x17, flags
     17  1.1  christos poly1305_init:
     18  1.1  christos 	cmp	x1,xzr			// key pointer == 0 ?
     19  1.1  christos 	stp	xzr,xzr,[x0]		// zero hash value
     20  1.1  christos 	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]
     21  1.1  christos 
     22  1.1  christos 	csel	x0,xzr,x0,eq		// no key: return value becomes 0
     23  1.1  christos 	b.eq	.Lno_key
     24  1.1  christos 
                       // The literal at .LOPENSSL_armcap_P (defined outside this view) is
                       // loaded as an offset and added to the literal's own address to reach
                       // the capability word, which keeps the lookup position-independent.
     25  1.1  christos #ifdef	__ILP32__
     26  1.1  christos 	ldrsw	x11,.LOPENSSL_armcap_P
     27  1.1  christos #else
     28  1.1  christos 	ldr	x11,.LOPENSSL_armcap_P
     29  1.1  christos #endif
     30  1.1  christos 	adr	x10,.LOPENSSL_armcap_P
     31  1.1  christos 
     32  1.1  christos 	ldp	x7,x8,[x1]		// load key
     33  1.1  christos 	mov	x9,#0xfffffffc0fffffff
     34  1.1  christos 	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff (clamp mask for r0)
     35  1.1  christos 	ldr	w17,[x10,x11]		// w17 = capability word
     36  1.1  christos #ifdef	__ARMEB__
     37  1.1  christos 	rev	x7,x7			// flip bytes
     38  1.1  christos 	rev	x8,x8
     39  1.1  christos #endif
     40  1.1  christos 	and	x7,x7,x9		// &=0ffffffc0fffffff
     41  1.1  christos 	and	x9,x9,#-4		// mask for r1 additionally clears the low two bits
     42  1.1  christos 	and	x8,x8,x9		// &=0ffffffc0ffffffc
     43  1.1  christos 	stp	x7,x8,[x0,#32]	// save key value
     44  1.1  christos 
     45  1.1  christos 	tst	w17,#ARMV7_NEON		// Z set when the NEON capability bit is clear
     46  1.1  christos 
     47  1.1  christos 	adr	x12,poly1305_blocks
     48  1.1  christos 	adr	x7,poly1305_blocks_neon
     49  1.1  christos 	adr	x13,poly1305_emit
     50  1.1  christos 	adr	x8,poly1305_emit_neon
     51  1.1  christos 
     52  1.1  christos 	csel	x12,x12,x7,eq		// scalar routines without NEON, NEON ones otherwise
     53  1.1  christos 	csel	x13,x13,x8,eq
     54  1.1  christos 
     55  1.1  christos #ifdef	__ILP32__
     56  1.1  christos 	stp	w12,w13,[x2]		// ILP32: entry points stored as 32-bit values
     57  1.1  christos #else
     58  1.1  christos 	stp	x12,x13,[x2]
     59  1.1  christos #endif
     60  1.1  christos 
     61  1.1  christos 	mov	x0,#1			// report that entry points were written
     62  1.1  christos .Lno_key:
     63  1.1  christos 	ret
     64  1.1  christos .size	poly1305_init,.-poly1305_init
     65  1.1  christos 
     66  1.1  christos .type	poly1305_blocks,%function
     67  1.1  christos .align	5
                       // poly1305_blocks: absorb whole 16-byte blocks into the hash (scalar path).
                       //   In:  x0 = context; hash limbs h0,h1,h2 at [x0],[x0,#8],[x0,#16] are
                       //             read and written as three 64-bit words (base 2^64 form),
                       //             key limbs r0,r1 are read from [x0,#32]
                       //        x1 = input pointer, advanced by 16 per block
                       //        x2 = length in bytes; rounded down to a multiple of 16, and
                       //             nothing is touched when the rounded length is zero
                       //        x3 = value added into the top limb h2 for every block
                       //   Out: updated hash stored back into the context
                       //   Clobbers: x1, x2, x4-x14, flags
                       //   The stored value is only partially reduced; poly1305_emit performs
                       //   the final comparison against the modulus.
     68  1.1  christos poly1305_blocks:
     69  1.1  christos 	ands	x2,x2,#-16		// drop any trailing partial block
     70  1.1  christos 	b.eq	.Lno_data
     71  1.1  christos 
     72  1.1  christos 	ldp	x4,x5,[x0]		// load hash value
     73  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
     74  1.1  christos 	ldr	x6,[x0,#16]
                       // poly1305_init clears the low two bits of r1, so r1 + (r1 >> 2) is
                       // exactly 5*r1/4; it is the multiplier for terms that wrap past 2^130.
     75  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
     76  1.1  christos 	b	.Loop
     77  1.1  christos 
     78  1.1  christos .align	5
     79  1.1  christos .Loop:
     80  1.1  christos 	ldp	x10,x11,[x1],#16	// load input
     81  1.1  christos 	sub	x2,x2,#16
     82  1.1  christos #ifdef	__ARMEB__
     83  1.1  christos 	rev	x10,x10
     84  1.1  christos 	rev	x11,x11
     85  1.1  christos #endif
     86  1.1  christos 	adds	x4,x4,x10		// accumulate input
     87  1.1  christos 	adcs	x5,x5,x11
     88  1.1  christos 
                       // mul/umulh leave the flags alone, so the carry from the adcs above
                       // is still live when the adc into h2 executes after the first mul.
     89  1.1  christos 	mul	x12,x4,x7		// h0*r0
     90  1.1  christos 	adc	x6,x6,x3		// h2 += x3 + carry
     91  1.1  christos 	umulh	x13,x4,x7
     92  1.1  christos 
     93  1.1  christos 	mul	x10,x5,x9		// h1*5*r1
     94  1.1  christos 	umulh	x11,x5,x9
     95  1.1  christos 
     96  1.1  christos 	adds	x12,x12,x10		// x12 = d0
     97  1.1  christos 	mul	x10,x4,x8		// h0*r1
     98  1.1  christos 	adc	x13,x13,x11
     99  1.1  christos 	umulh	x14,x4,x8
    100  1.1  christos 
    101  1.1  christos 	adds	x13,x13,x10
    102  1.1  christos 	mul	x10,x5,x7		// h1*r0
    103  1.1  christos 	adc	x14,x14,xzr
    104  1.1  christos 	umulh	x11,x5,x7
    105  1.1  christos 
    106  1.1  christos 	adds	x13,x13,x10
    107  1.1  christos 	mul	x10,x6,x9		// h2*5*r1
    108  1.1  christos 	adc	x14,x14,x11
    109  1.1  christos 	mul	x11,x6,x7		// h2*r0
    110  1.1  christos 
    111  1.1  christos 	adds	x13,x13,x10		// x13 = d1
    112  1.1  christos 	adc	x14,x14,x11		// x14 = d2
    113  1.1  christos 
                       // Bits 2 and up of d2 lie at or above 2^130. (d2 & -4) + (d2 >> 2)
                       // equals 5*(d2 >> 2), which is folded back into the low limb.
    114  1.1  christos 	and	x10,x14,#-4		// final reduction
    115  1.1  christos 	and	x6,x14,#3
    116  1.1  christos 	add	x10,x10,x14,lsr#2
    117  1.1  christos 	adds	x4,x12,x10
    118  1.1  christos 	adcs	x5,x13,xzr
    119  1.1  christos 	adc	x6,x6,xzr
    120  1.1  christos 
    121  1.1  christos 	cbnz	x2,.Loop		// more blocks remaining?
    122  1.1  christos 
    123  1.1  christos 	stp	x4,x5,[x0]		// store hash value
    124  1.1  christos 	str	x6,[x0,#16]
    125  1.1  christos 
    126  1.1  christos .Lno_data:
    127  1.1  christos 	ret
    128  1.1  christos .size	poly1305_blocks,.-poly1305_blocks
    129  1.1  christos 
    130  1.1  christos .type	poly1305_emit,%function
    131  1.1  christos .align	5
                       // poly1305_emit: finish the reduction, add the nonce, write the 16-byte tag.
                       //   In:  x0 = context; hash limbs are read as three 64-bit words at
                       //             [x0],[x0,#8],[x0,#16] (the context is not modified)
                       //        x1 = output pointer (16 bytes are written)
                       //        x2 = nonce pointer (16 bytes are read)
                       //   Clobbers: x4-x6, x10-x14, flags
     132  1.1  christos poly1305_emit:
    133  1.1  christos 	ldp	x4,x5,[x0]		// load hash base 2^64
    134  1.1  christos 	ldr	x6,[x0,#16]
    135  1.1  christos 	ldp	x10,x11,[x2]	// load nonce
    136  1.1  christos 
                       // Compute h+5 across all three limbs. If the sum has any bit at
                       // position 130 or above (bits 2.. of the top limb), then h was at
                       // least 2^130-5 and the low 128 bits of h+5 are the reduced value.
    137  1.1  christos 	adds	x12,x4,#5		// compare to modulus
    138  1.1  christos 	adcs	x13,x5,xzr
    139  1.1  christos 	adc	x14,x6,xzr
    140  1.1  christos 
    141  1.1  christos 	tst	x14,#-4			// see if it's carried/borrowed
    142  1.1  christos 
    143  1.1  christos 	csel	x4,x4,x12,eq		// eq: h was below the modulus, keep it
    144  1.1  christos 	csel	x5,x5,x13,eq
    145  1.1  christos 
    146  1.1  christos #ifdef	__ARMEB__
    147  1.1  christos 	ror	x10,x10,#32		// flip nonce words
    148  1.1  christos 	ror	x11,x11,#32
    149  1.1  christos #endif
    150  1.1  christos 	adds	x4,x4,x10		// accumulate nonce
    151  1.1  christos 	adc	x5,x5,x11		// carry out of bit 127 is discarded
    152  1.1  christos #ifdef	__ARMEB__
    153  1.1  christos 	rev	x4,x4			// flip output bytes
    154  1.1  christos 	rev	x5,x5
    155  1.1  christos #endif
    156  1.1  christos 	stp	x4,x5,[x1]		// write result
    157  1.1  christos 
    158  1.1  christos 	ret
    159  1.1  christos .size	poly1305_emit,.-poly1305_emit
    160  1.1  christos .type	poly1305_mult,%function
    161  1.1  christos .align	5
                       // poly1305_mult: internal helper, one multiply-and-reduce step.
                       // It uses a private register convention rather than the standard
                       // argument registers and is reached with bl from poly1305_blocks_neon.
                       //   In:  x4,x5,x6 = h0,h1,h2 (value to multiply, base 2^64)
                       //        x7,x8    = r0,r1    (multiplier)
                       //        x9       = s1 = r1 + (r1 >> 2), prepared by the caller
                       //   Out: x4,x5,x6 = partially reduced product
                       //   Clobbers: x10-x14, flags; x7-x9 are left unchanged
                       // The instruction sequence is the same as the multiply/reduce part
                       // of the .Loop body in poly1305_blocks.
     162  1.1  christos poly1305_mult:
    163  1.1  christos 	mul	x12,x4,x7		// h0*r0
    164  1.1  christos 	umulh	x13,x4,x7
    165  1.1  christos 
    166  1.1  christos 	mul	x10,x5,x9		// h1*5*r1
    167  1.1  christos 	umulh	x11,x5,x9
    168  1.1  christos 
    169  1.1  christos 	adds	x12,x12,x10		// x12 = d0
    170  1.1  christos 	mul	x10,x4,x8		// h0*r1
    171  1.1  christos 	adc	x13,x13,x11
    172  1.1  christos 	umulh	x14,x4,x8
    173  1.1  christos 
    174  1.1  christos 	adds	x13,x13,x10
    175  1.1  christos 	mul	x10,x5,x7		// h1*r0
    176  1.1  christos 	adc	x14,x14,xzr
    177  1.1  christos 	umulh	x11,x5,x7
    178  1.1  christos 
    179  1.1  christos 	adds	x13,x13,x10
    180  1.1  christos 	mul	x10,x6,x9		// h2*5*r1
    181  1.1  christos 	adc	x14,x14,x11
    182  1.1  christos 	mul	x11,x6,x7		// h2*r0
    183  1.1  christos 
    184  1.1  christos 	adds	x13,x13,x10		// x13 = d1
    185  1.1  christos 	adc	x14,x14,x11		// x14 = d2
    186  1.1  christos 
                       // Keep the low two bits of d2 as the new h2; the rest, (d2 >> 2),
                       // is multiplied by 5 via (d2 & -4) + (d2 >> 2) and added to d0.
    187  1.1  christos 	and	x10,x14,#-4		// final reduction
    188  1.1  christos 	and	x6,x14,#3
    189  1.1  christos 	add	x10,x10,x14,lsr#2
    190  1.1  christos 	adds	x4,x12,x10
    191  1.1  christos 	adcs	x5,x13,xzr
    192  1.1  christos 	adc	x6,x6,xzr
    193  1.1  christos 
    194  1.1  christos 	ret
    195  1.1  christos .size	poly1305_mult,.-poly1305_mult
    196  1.1  christos 
    197  1.1  christos .type	poly1305_splat,%function
    198  1.1  christos .align	5
                       // poly1305_splat: internal helper, converts the value in x4,x5,x6 from
                       // base 2^64 to five 26-bit limbs and stores them, together with the
                       // limbs multiplied by 5, as 32-bit words spaced 16 bytes apart.
                       //   In:  x4,x5,x6 = value in base 2^64 (left unchanged)
                       //        x0       = address of the first word to write
                       //   Out: words at x0 + 16*k, k = 0..8: r0, r1, s1, r2, s2, r3, s3, r4, s4
                       //        where sN = 5*rN
                       //   Clobbers: x12-x16; flags are not modified
                       // poly1305_blocks_neon calls this four times, lowering x0 by 4 bytes
                       // between calls, so each 16-byte row collects the same limb of four
                       // successive values.
     199  1.1  christos poly1305_splat:
    200  1.1  christos 	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
    201  1.1  christos 	ubfx	x13,x4,#26,#26		// bits 26..51
    202  1.1  christos 	extr	x14,x5,x4,#52
    203  1.1  christos 	and	x14,x14,#0x03ffffff	// bits 52..77
    204  1.1  christos 	ubfx	x15,x5,#14,#26		// bits 78..103
    205  1.1  christos 	extr	x16,x6,x5,#40		// bits 104 and up (not masked)
    206  1.1  christos 
                       // Stores are interleaved with the *5 computations; each add reuses a
                       // register only after the limb it held has been stored or consumed.
    207  1.1  christos 	str	w12,[x0,#16*0]	// r0
    208  1.1  christos 	add	w12,w13,w13,lsl#2	// r1*5
    209  1.1  christos 	str	w13,[x0,#16*1]	// r1
    210  1.1  christos 	add	w13,w14,w14,lsl#2	// r2*5
    211  1.1  christos 	str	w12,[x0,#16*2]	// s1
    212  1.1  christos 	str	w14,[x0,#16*3]	// r2
    213  1.1  christos 	add	w14,w15,w15,lsl#2	// r3*5
    214  1.1  christos 	str	w13,[x0,#16*4]	// s2
    215  1.1  christos 	str	w15,[x0,#16*5]	// r3
    216  1.1  christos 	add	w15,w16,w16,lsl#2	// r4*5
    217  1.1  christos 	str	w14,[x0,#16*6]	// s3
    218  1.1  christos 	str	w16,[x0,#16*7]	// r4
    219  1.1  christos 	str	w15,[x0,#16*8]	// s4
    220  1.1  christos 
    221  1.1  christos 	ret
    222  1.1  christos .size	poly1305_splat,.-poly1305_splat
    223  1.1  christos 
    224  1.1  christos .type	poly1305_blocks_neon,%function
    225  1.1  christos .align	5
    226  1.1  christos poly1305_blocks_neon:
    227  1.1  christos 	ldr	x17,[x0,#24]
    228  1.1  christos 	cmp	x2,#128
    229  1.1  christos 	b.hs	.Lblocks_neon
    230  1.1  christos 	cbz	x17,poly1305_blocks
    231  1.1  christos 
    232  1.1  christos .Lblocks_neon:
    233  1.1  christos .inst	0xd503233f		// paciasp
    234  1.1  christos 	stp	x29,x30,[sp,#-80]!
    235  1.1  christos 	add	x29,sp,#0
    236  1.1  christos 
    237  1.1  christos 	ands	x2,x2,#-16
    238  1.1  christos 	b.eq	.Lno_data_neon
    239  1.1  christos 
    240  1.1  christos 	cbz	x17,.Lbase2_64_neon
    241  1.1  christos 
    242  1.1  christos 	ldp	w10,w11,[x0]		// load hash value base 2^26
    243  1.1  christos 	ldp	w12,w13,[x0,#8]
    244  1.1  christos 	ldr	w14,[x0,#16]
    245  1.1  christos 
    246  1.1  christos 	tst	x2,#31
    247  1.1  christos 	b.eq	.Leven_neon
    248  1.1  christos 
    249  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
    250  1.1  christos 
    251  1.1  christos 	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
    252  1.1  christos 	lsr	x5,x12,#12
    253  1.1  christos 	adds	x4,x4,x12,lsl#52
    254  1.1  christos 	add	x5,x5,x13,lsl#14
    255  1.1  christos 	adc	x5,x5,xzr
    256  1.1  christos 	lsr	x6,x14,#24
    257  1.1  christos 	adds	x5,x5,x14,lsl#40
    258  1.1  christos 	adc	x14,x6,xzr		// can be partially reduced...
    259  1.1  christos 
    260  1.1  christos 	ldp	x12,x13,[x1],#16	// load input
    261  1.1  christos 	sub	x2,x2,#16
    262  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    263  1.1  christos 
    264  1.1  christos 	and	x10,x14,#-4		// ... so reduce
    265  1.1  christos 	and	x6,x14,#3
    266  1.1  christos 	add	x10,x10,x14,lsr#2
    267  1.1  christos 	adds	x4,x4,x10
    268  1.1  christos 	adcs	x5,x5,xzr
    269  1.1  christos 	adc	x6,x6,xzr
    270  1.1  christos 
    271  1.1  christos #ifdef	__ARMEB__
    272  1.1  christos 	rev	x12,x12
    273  1.1  christos 	rev	x13,x13
    274  1.1  christos #endif
    275  1.1  christos 	adds	x4,x4,x12		// accumulate input
    276  1.1  christos 	adcs	x5,x5,x13
    277  1.1  christos 	adc	x6,x6,x3
    278  1.1  christos 
    279  1.1  christos 	bl	poly1305_mult
    280  1.1  christos 	ldr	x30,[sp,#8]
    281  1.1  christos 
    282  1.1  christos 	cbz	x3,.Lstore_base2_64_neon
    283  1.1  christos 
    284  1.1  christos 	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
    285  1.1  christos 	ubfx	x11,x4,#26,#26
    286  1.1  christos 	extr	x12,x5,x4,#52
    287  1.1  christos 	and	x12,x12,#0x03ffffff
    288  1.1  christos 	ubfx	x13,x5,#14,#26
    289  1.1  christos 	extr	x14,x6,x5,#40
    290  1.1  christos 
    291  1.1  christos 	cbnz	x2,.Leven_neon
    292  1.1  christos 
    293  1.1  christos 	stp	w10,w11,[x0]		// store hash value base 2^26
    294  1.1  christos 	stp	w12,w13,[x0,#8]
    295  1.1  christos 	str	w14,[x0,#16]
    296  1.1  christos 	b	.Lno_data_neon
    297  1.1  christos 
    298  1.1  christos .align	4
    299  1.1  christos .Lstore_base2_64_neon:
    300  1.1  christos 	stp	x4,x5,[x0]		// store hash value base 2^64
    301  1.1  christos 	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
    302  1.1  christos 	b	.Lno_data_neon
    303  1.1  christos 
    304  1.1  christos .align	4
    305  1.1  christos .Lbase2_64_neon:
    306  1.1  christos 	ldp	x7,x8,[x0,#32]	// load key value
    307  1.1  christos 
    308  1.1  christos 	ldp	x4,x5,[x0]		// load hash value base 2^64
    309  1.1  christos 	ldr	x6,[x0,#16]
    310  1.1  christos 
    311  1.1  christos 	tst	x2,#31
    312  1.1  christos 	b.eq	.Linit_neon
    313  1.1  christos 
    314  1.1  christos 	ldp	x12,x13,[x1],#16	// load input
    315  1.1  christos 	sub	x2,x2,#16
    316  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    317  1.1  christos #ifdef	__ARMEB__
    318  1.1  christos 	rev	x12,x12
    319  1.1  christos 	rev	x13,x13
    320  1.1  christos #endif
    321  1.1  christos 	adds	x4,x4,x12		// accumulate input
    322  1.1  christos 	adcs	x5,x5,x13
    323  1.1  christos 	adc	x6,x6,x3
    324  1.1  christos 
    325  1.1  christos 	bl	poly1305_mult
    326  1.1  christos 
    327  1.1  christos .Linit_neon:
    328  1.1  christos 	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
    329  1.1  christos 	ubfx	x11,x4,#26,#26
    330  1.1  christos 	extr	x12,x5,x4,#52
    331  1.1  christos 	and	x12,x12,#0x03ffffff
    332  1.1  christos 	ubfx	x13,x5,#14,#26
    333  1.1  christos 	extr	x14,x6,x5,#40
    334  1.1  christos 
    335  1.1  christos 	stp	d8,d9,[sp,#16]		// meet ABI requirements
    336  1.1  christos 	stp	d10,d11,[sp,#32]
    337  1.1  christos 	stp	d12,d13,[sp,#48]
    338  1.1  christos 	stp	d14,d15,[sp,#64]
    339  1.1  christos 
    340  1.1  christos 	fmov	d24,x10
    341  1.1  christos 	fmov	d25,x11
    342  1.1  christos 	fmov	d26,x12
    343  1.1  christos 	fmov	d27,x13
    344  1.1  christos 	fmov	d28,x14
    345  1.1  christos 
    346  1.1  christos 	////////////////////////////////// initialize r^n table
    347  1.1  christos 	mov	x4,x7			// r^1
    348  1.1  christos 	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
    349  1.1  christos 	mov	x5,x8
    350  1.1  christos 	mov	x6,xzr
    351  1.1  christos 	add	x0,x0,#48+12
    352  1.1  christos 	bl	poly1305_splat
    353  1.1  christos 
    354  1.1  christos 	bl	poly1305_mult		// r^2
    355  1.1  christos 	sub	x0,x0,#4
    356  1.1  christos 	bl	poly1305_splat
    357  1.1  christos 
    358  1.1  christos 	bl	poly1305_mult		// r^3
    359  1.1  christos 	sub	x0,x0,#4
    360  1.1  christos 	bl	poly1305_splat
    361  1.1  christos 
    362  1.1  christos 	bl	poly1305_mult		// r^4
    363  1.1  christos 	sub	x0,x0,#4
    364  1.1  christos 	bl	poly1305_splat
    365  1.1  christos 	ldr	x30,[sp,#8]
    366  1.1  christos 
    367  1.1  christos 	add	x16,x1,#32
    368  1.1  christos 	adr	x17,.Lzeros
    369  1.1  christos 	subs	x2,x2,#64
    370  1.1  christos 	csel	x16,x17,x16,lo
    371  1.1  christos 
    372  1.1  christos 	mov	x4,#1
    373  1.1  christos 	str	x4,[x0,#-24]		// set is_base2_26
    374  1.1  christos 	sub	x0,x0,#48		// restore original x0
    375  1.1  christos 	b	.Ldo_neon
    376  1.1  christos 
    377  1.1  christos .align	4
    378  1.1  christos .Leven_neon:
    379  1.1  christos 	add	x16,x1,#32
    380  1.1  christos 	adr	x17,.Lzeros
    381  1.1  christos 	subs	x2,x2,#64
    382  1.1  christos 	csel	x16,x17,x16,lo
    383  1.1  christos 
    384  1.1  christos 	stp	d8,d9,[sp,#16]		// meet ABI requirements
    385  1.1  christos 	stp	d10,d11,[sp,#32]
    386  1.1  christos 	stp	d12,d13,[sp,#48]
    387  1.1  christos 	stp	d14,d15,[sp,#64]
    388  1.1  christos 
    389  1.1  christos 	fmov	d24,x10
    390  1.1  christos 	fmov	d25,x11
    391  1.1  christos 	fmov	d26,x12
    392  1.1  christos 	fmov	d27,x13
    393  1.1  christos 	fmov	d28,x14
    394  1.1  christos 
    395  1.1  christos .Ldo_neon:
    396  1.1  christos 	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
    397  1.1  christos 	ldp	x9,x13,[x16],#48
    398  1.1  christos 
    399  1.1  christos 	lsl	x3,x3,#24
    400  1.1  christos 	add	x15,x0,#48
    401  1.1  christos 
    402  1.1  christos #ifdef	__ARMEB__
    403  1.1  christos 	rev	x8,x8
    404  1.1  christos 	rev	x12,x12
    405  1.1  christos 	rev	x9,x9
    406  1.1  christos 	rev	x13,x13
    407  1.1  christos #endif
    408  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    409  1.1  christos 	and	x5,x9,#0x03ffffff
    410  1.1  christos 	ubfx	x6,x8,#26,#26
    411  1.1  christos 	ubfx	x7,x9,#26,#26
    412  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    413  1.1  christos 	extr	x8,x12,x8,#52
    414  1.1  christos 	extr	x9,x13,x9,#52
    415  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    416  1.1  christos 	fmov	d14,x4
    417  1.1  christos 	and	x8,x8,#0x03ffffff
    418  1.1  christos 	and	x9,x9,#0x03ffffff
    419  1.1  christos 	ubfx	x10,x12,#14,#26
    420  1.1  christos 	ubfx	x11,x13,#14,#26
    421  1.1  christos 	add	x12,x3,x12,lsr#40
    422  1.1  christos 	add	x13,x3,x13,lsr#40
    423  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    424  1.1  christos 	fmov	d15,x6
    425  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    426  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    427  1.1  christos 	fmov	d16,x8
    428  1.1  christos 	fmov	d17,x10
    429  1.1  christos 	fmov	d18,x12
    430  1.1  christos 
    431  1.1  christos 	ldp	x8,x12,[x1],#16	// inp[0:1]
    432  1.1  christos 	ldp	x9,x13,[x1],#48
    433  1.1  christos 
    434  1.1  christos 	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
    435  1.1  christos 	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
    436  1.1  christos 	ld1	{v8.4s},[x15]
    437  1.1  christos 
    438  1.1  christos #ifdef	__ARMEB__
    439  1.1  christos 	rev	x8,x8
    440  1.1  christos 	rev	x12,x12
    441  1.1  christos 	rev	x9,x9
    442  1.1  christos 	rev	x13,x13
    443  1.1  christos #endif
    444  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    445  1.1  christos 	and	x5,x9,#0x03ffffff
    446  1.1  christos 	ubfx	x6,x8,#26,#26
    447  1.1  christos 	ubfx	x7,x9,#26,#26
    448  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    449  1.1  christos 	extr	x8,x12,x8,#52
    450  1.1  christos 	extr	x9,x13,x9,#52
    451  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    452  1.1  christos 	fmov	d9,x4
    453  1.1  christos 	and	x8,x8,#0x03ffffff
    454  1.1  christos 	and	x9,x9,#0x03ffffff
    455  1.1  christos 	ubfx	x10,x12,#14,#26
    456  1.1  christos 	ubfx	x11,x13,#14,#26
    457  1.1  christos 	add	x12,x3,x12,lsr#40
    458  1.1  christos 	add	x13,x3,x13,lsr#40
    459  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    460  1.1  christos 	fmov	d10,x6
    461  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    462  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    463  1.1  christos 	movi	v31.2d,#-1
    464  1.1  christos 	fmov	d11,x8
    465  1.1  christos 	fmov	d12,x10
    466  1.1  christos 	fmov	d13,x12
    467  1.1  christos 	ushr	v31.2d,v31.2d,#38
    468  1.1  christos 
    469  1.1  christos 	b.ls	.Lskip_loop
    470  1.1  christos 
    471  1.1  christos .align	4
    472  1.1  christos .Loop_neon:
    473  1.1  christos 	////////////////////////////////////////////////////////////////
    474  1.1  christos 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
    475  1.1  christos 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
    476  1.1  christos 	//   ___________________/
    477  1.1  christos 	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
    478  1.1  christos 	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
    479  1.1  christos 	//   ___________________/ ____________________/
    480  1.1  christos 	//
    481  1.1  christos 	// Note that we start with inp[2:3]*r^2. This is because it
    482  1.1  christos 	// doesn't depend on reduction in previous iteration.
    483  1.1  christos 	////////////////////////////////////////////////////////////////
    484  1.1  christos 	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
    485  1.1  christos 	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
    486  1.1  christos 	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
    487  1.1  christos 	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
    488  1.1  christos 	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
    489  1.1  christos 
    490  1.1  christos 	subs	x2,x2,#64
    491  1.1  christos 	umull	v23.2d,v14.2s,v7.s[2]
    492  1.1  christos 	csel	x16,x17,x16,lo
    493  1.1  christos 	umull	v22.2d,v14.2s,v5.s[2]
    494  1.1  christos 	umull	v21.2d,v14.2s,v3.s[2]
    495  1.1  christos 	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
    496  1.1  christos 	umull	v20.2d,v14.2s,v1.s[2]
    497  1.1  christos 	ldp	x9,x13,[x16],#48
    498  1.1  christos 	umull	v19.2d,v14.2s,v0.s[2]
    499  1.1  christos #ifdef	__ARMEB__
    500  1.1  christos 	rev	x8,x8
    501  1.1  christos 	rev	x12,x12
    502  1.1  christos 	rev	x9,x9
    503  1.1  christos 	rev	x13,x13
    504  1.1  christos #endif
    505  1.1  christos 
    506  1.1  christos 	umlal	v23.2d,v15.2s,v5.s[2]
    507  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    508  1.1  christos 	umlal	v22.2d,v15.2s,v3.s[2]
    509  1.1  christos 	and	x5,x9,#0x03ffffff
    510  1.1  christos 	umlal	v21.2d,v15.2s,v1.s[2]
    511  1.1  christos 	ubfx	x6,x8,#26,#26
    512  1.1  christos 	umlal	v20.2d,v15.2s,v0.s[2]
    513  1.1  christos 	ubfx	x7,x9,#26,#26
    514  1.1  christos 	umlal	v19.2d,v15.2s,v8.s[2]
    515  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    516  1.1  christos 
    517  1.1  christos 	umlal	v23.2d,v16.2s,v3.s[2]
    518  1.1  christos 	extr	x8,x12,x8,#52
    519  1.1  christos 	umlal	v22.2d,v16.2s,v1.s[2]
    520  1.1  christos 	extr	x9,x13,x9,#52
    521  1.1  christos 	umlal	v21.2d,v16.2s,v0.s[2]
    522  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    523  1.1  christos 	umlal	v20.2d,v16.2s,v8.s[2]
    524  1.1  christos 	fmov	d14,x4
    525  1.1  christos 	umlal	v19.2d,v16.2s,v6.s[2]
    526  1.1  christos 	and	x8,x8,#0x03ffffff
    527  1.1  christos 
    528  1.1  christos 	umlal	v23.2d,v17.2s,v1.s[2]
    529  1.1  christos 	and	x9,x9,#0x03ffffff
    530  1.1  christos 	umlal	v22.2d,v17.2s,v0.s[2]
    531  1.1  christos 	ubfx	x10,x12,#14,#26
    532  1.1  christos 	umlal	v21.2d,v17.2s,v8.s[2]
    533  1.1  christos 	ubfx	x11,x13,#14,#26
    534  1.1  christos 	umlal	v20.2d,v17.2s,v6.s[2]
    535  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    536  1.1  christos 	umlal	v19.2d,v17.2s,v4.s[2]
    537  1.1  christos 	fmov	d15,x6
    538  1.1  christos 
    539  1.1  christos 	add	v11.2s,v11.2s,v26.2s
    540  1.1  christos 	add	x12,x3,x12,lsr#40
    541  1.1  christos 	umlal	v23.2d,v18.2s,v0.s[2]
    542  1.1  christos 	add	x13,x3,x13,lsr#40
    543  1.1  christos 	umlal	v22.2d,v18.2s,v8.s[2]
    544  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    545  1.1  christos 	umlal	v21.2d,v18.2s,v6.s[2]
    546  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    547  1.1  christos 	umlal	v20.2d,v18.2s,v4.s[2]
    548  1.1  christos 	fmov	d16,x8
    549  1.1  christos 	umlal	v19.2d,v18.2s,v2.s[2]
    550  1.1  christos 	fmov	d17,x10
    551  1.1  christos 
    552  1.1  christos 	////////////////////////////////////////////////////////////////
    553  1.1  christos 	// (hash+inp[0:1])*r^4 and accumulate
    554  1.1  christos 
    555  1.1  christos 	add	v9.2s,v9.2s,v24.2s
    556  1.1  christos 	fmov	d18,x12
    557  1.1  christos 	umlal	v22.2d,v11.2s,v1.s[0]
    558  1.1  christos 	ldp	x8,x12,[x1],#16	// inp[0:1]
    559  1.1  christos 	umlal	v19.2d,v11.2s,v6.s[0]
    560  1.1  christos 	ldp	x9,x13,[x1],#48
    561  1.1  christos 	umlal	v23.2d,v11.2s,v3.s[0]
    562  1.1  christos 	umlal	v20.2d,v11.2s,v8.s[0]
    563  1.1  christos 	umlal	v21.2d,v11.2s,v0.s[0]
    564  1.1  christos #ifdef	__ARMEB__
    565  1.1  christos 	rev	x8,x8
    566  1.1  christos 	rev	x12,x12
    567  1.1  christos 	rev	x9,x9
    568  1.1  christos 	rev	x13,x13
    569  1.1  christos #endif
    570  1.1  christos 
    571  1.1  christos 	add	v10.2s,v10.2s,v25.2s
    572  1.1  christos 	umlal	v22.2d,v9.2s,v5.s[0]
    573  1.1  christos 	umlal	v23.2d,v9.2s,v7.s[0]
    574  1.1  christos 	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
    575  1.1  christos 	umlal	v21.2d,v9.2s,v3.s[0]
    576  1.1  christos 	and	x5,x9,#0x03ffffff
    577  1.1  christos 	umlal	v19.2d,v9.2s,v0.s[0]
    578  1.1  christos 	ubfx	x6,x8,#26,#26
    579  1.1  christos 	umlal	v20.2d,v9.2s,v1.s[0]
    580  1.1  christos 	ubfx	x7,x9,#26,#26
    581  1.1  christos 
    582  1.1  christos 	add	v12.2s,v12.2s,v27.2s
    583  1.1  christos 	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
    584  1.1  christos 	umlal	v22.2d,v10.2s,v3.s[0]
    585  1.1  christos 	extr	x8,x12,x8,#52
    586  1.1  christos 	umlal	v23.2d,v10.2s,v5.s[0]
    587  1.1  christos 	extr	x9,x13,x9,#52
    588  1.1  christos 	umlal	v19.2d,v10.2s,v8.s[0]
    589  1.1  christos 	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
    590  1.1  christos 	umlal	v21.2d,v10.2s,v1.s[0]
    591  1.1  christos 	fmov	d9,x4
    592  1.1  christos 	umlal	v20.2d,v10.2s,v0.s[0]
    593  1.1  christos 	and	x8,x8,#0x03ffffff
    594  1.1  christos 
    595  1.1  christos 	add	v13.2s,v13.2s,v28.2s
    596  1.1  christos 	and	x9,x9,#0x03ffffff
    597  1.1  christos 	umlal	v22.2d,v12.2s,v0.s[0]
    598  1.1  christos 	ubfx	x10,x12,#14,#26
    599  1.1  christos 	umlal	v19.2d,v12.2s,v4.s[0]
    600  1.1  christos 	ubfx	x11,x13,#14,#26
    601  1.1  christos 	umlal	v23.2d,v12.2s,v1.s[0]
    602  1.1  christos 	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
    603  1.1  christos 	umlal	v20.2d,v12.2s,v6.s[0]
    604  1.1  christos 	fmov	d10,x6
    605  1.1  christos 	umlal	v21.2d,v12.2s,v8.s[0]
    606  1.1  christos 	add	x12,x3,x12,lsr#40
    607  1.1  christos 
    608  1.1  christos 	umlal	v22.2d,v13.2s,v8.s[0]
    609  1.1  christos 	add	x13,x3,x13,lsr#40
    610  1.1  christos 	umlal	v19.2d,v13.2s,v2.s[0]
    611  1.1  christos 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
    612  1.1  christos 	umlal	v23.2d,v13.2s,v0.s[0]
    613  1.1  christos 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
    614  1.1  christos 	umlal	v20.2d,v13.2s,v4.s[0]
    615  1.1  christos 	fmov	d11,x8
    616  1.1  christos 	umlal	v21.2d,v13.2s,v6.s[0]
    617  1.1  christos 	fmov	d12,x10
    618  1.1  christos 	fmov	d13,x12
    619  1.1  christos 
    620  1.1  christos 	/////////////////////////////////////////////////////////////////
    621  1.1  christos 	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
    622  1.1  christos 	// and P. Schwabe
    623  1.1  christos 	//
    624  1.1  christos 	// [see discussion in poly1305-armv4 module]
    625  1.1  christos 
    626  1.1  christos 	ushr	v29.2d,v22.2d,#26
    627  1.1  christos 	xtn	v27.2s,v22.2d
    628  1.1  christos 	ushr	v30.2d,v19.2d,#26
    629  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    630  1.1  christos 	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
    631  1.1  christos 	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
    632  1.1  christos 	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
    633  1.1  christos 
    634  1.1  christos 	ushr	v29.2d,v23.2d,#26
    635  1.1  christos 	xtn	v28.2s,v23.2d
    636  1.1  christos 	ushr	v30.2d,v20.2d,#26
    637  1.1  christos 	xtn	v25.2s,v20.2d
    638  1.1  christos 	bic	v28.2s,#0xfc,lsl#24
    639  1.1  christos 	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
    640  1.1  christos 
    641  1.1  christos 	add	v19.2d,v19.2d,v29.2d
    642  1.1  christos 	shl	v29.2d,v29.2d,#2
    643  1.1  christos 	shrn	v30.2s,v21.2d,#26
    644  1.1  christos 	xtn	v26.2s,v21.2d
    645  1.1  christos 	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
    646  1.1  christos 	bic	v25.2s,#0xfc,lsl#24
    647  1.1  christos 	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
    648  1.1  christos 	bic	v26.2s,#0xfc,lsl#24
    649  1.1  christos 
    650  1.1  christos 	shrn	v29.2s,v19.2d,#26
    651  1.1  christos 	xtn	v24.2s,v19.2d
    652  1.1  christos 	ushr	v30.2s,v27.2s,#26
    653  1.1  christos 	bic	v27.2s,#0xfc,lsl#24
    654  1.1  christos 	bic	v24.2s,#0xfc,lsl#24
    655  1.1  christos 	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
    656  1.1  christos 	add	v28.2s,v28.2s,v30.2s		// h3 -> h4
    657  1.1  christos 
    658  1.1  christos 	b.hi	.Loop_neon
    659  1.1  christos 
    660  1.1  christos .Lskip_loop:
    661  1.1  christos 	dup	v16.2d,v16.d[0]
    662  1.1  christos 	add	v11.2s,v11.2s,v26.2s
    663  1.1  christos 
    664  1.1  christos 	////////////////////////////////////////////////////////////////
    665  1.1  christos 	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
    666  1.1  christos 
    667  1.1  christos 	adds	x2,x2,#32
    668  1.1  christos 	b.ne	.Long_tail
    669  1.1  christos 
    670  1.1  christos 	dup	v16.2d,v11.d[0]
    671  1.1  christos 	add	v14.2s,v9.2s,v24.2s
    672  1.1  christos 	add	v17.2s,v12.2s,v27.2s
    673  1.1  christos 	add	v15.2s,v10.2s,v25.2s
    674  1.1  christos 	add	v18.2s,v13.2s,v28.2s
    675  1.1  christos 
    676  1.1  christos .Long_tail:
    677  1.1  christos 	dup	v14.2d,v14.d[0]
    678  1.1  christos 	umull2	v19.2d,v16.4s,v6.4s
    679  1.1  christos 	umull2	v22.2d,v16.4s,v1.4s
    680  1.1  christos 	umull2	v23.2d,v16.4s,v3.4s
    681  1.1  christos 	umull2	v21.2d,v16.4s,v0.4s
    682  1.1  christos 	umull2	v20.2d,v16.4s,v8.4s
    683  1.1  christos 
    684  1.1  christos 	dup	v15.2d,v15.d[0]
    685  1.1  christos 	umlal2	v19.2d,v14.4s,v0.4s
    686  1.1  christos 	umlal2	v21.2d,v14.4s,v3.4s
    687  1.1  christos 	umlal2	v22.2d,v14.4s,v5.4s
    688  1.1  christos 	umlal2	v23.2d,v14.4s,v7.4s
    689  1.1  christos 	umlal2	v20.2d,v14.4s,v1.4s
    690  1.1  christos 
    691  1.1  christos 	dup	v17.2d,v17.d[0]
    692  1.1  christos 	umlal2	v19.2d,v15.4s,v8.4s
    693  1.1  christos 	umlal2	v22.2d,v15.4s,v3.4s
    694  1.1  christos 	umlal2	v21.2d,v15.4s,v1.4s
    695  1.1  christos 	umlal2	v23.2d,v15.4s,v5.4s
    696  1.1  christos 	umlal2	v20.2d,v15.4s,v0.4s
    697  1.1  christos 
    698  1.1  christos 	dup	v18.2d,v18.d[0]
    699  1.1  christos 	umlal2	v22.2d,v17.4s,v0.4s
    700  1.1  christos 	umlal2	v23.2d,v17.4s,v1.4s
    701  1.1  christos 	umlal2	v19.2d,v17.4s,v4.4s
    702  1.1  christos 	umlal2	v20.2d,v17.4s,v6.4s
    703  1.1  christos 	umlal2	v21.2d,v17.4s,v8.4s
    704  1.1  christos 
    705  1.1  christos 	umlal2	v22.2d,v18.4s,v8.4s
    706  1.1  christos 	umlal2	v19.2d,v18.4s,v2.4s
    707  1.1  christos 	umlal2	v23.2d,v18.4s,v0.4s
    708  1.1  christos 	umlal2	v20.2d,v18.4s,v4.4s
    709  1.1  christos 	umlal2	v21.2d,v18.4s,v6.4s
    710  1.1  christos 
    711  1.1  christos 	b.eq	.Lshort_tail
    712  1.1  christos 
    713  1.1  christos 	////////////////////////////////////////////////////////////////
    714  1.1  christos 	// (hash+inp[0:1])*r^4:r^3 and accumulate
    715  1.1  christos 
    716  1.1  christos 	add	v9.2s,v9.2s,v24.2s
    717  1.1  christos 	umlal	v22.2d,v11.2s,v1.2s
    718  1.1  christos 	umlal	v19.2d,v11.2s,v6.2s
    719  1.1  christos 	umlal	v23.2d,v11.2s,v3.2s
    720  1.1  christos 	umlal	v20.2d,v11.2s,v8.2s
    721  1.1  christos 	umlal	v21.2d,v11.2s,v0.2s
    722  1.1  christos 
    723  1.1  christos 	add	v10.2s,v10.2s,v25.2s
    724  1.1  christos 	umlal	v22.2d,v9.2s,v5.2s
    725  1.1  christos 	umlal	v19.2d,v9.2s,v0.2s
    726  1.1  christos 	umlal	v23.2d,v9.2s,v7.2s
    727  1.1  christos 	umlal	v20.2d,v9.2s,v1.2s
    728  1.1  christos 	umlal	v21.2d,v9.2s,v3.2s
    729  1.1  christos 
    730  1.1  christos 	add	v12.2s,v12.2s,v27.2s
    731  1.1  christos 	umlal	v22.2d,v10.2s,v3.2s
    732  1.1  christos 	umlal	v19.2d,v10.2s,v8.2s
    733  1.1  christos 	umlal	v23.2d,v10.2s,v5.2s
    734  1.1  christos 	umlal	v20.2d,v10.2s,v0.2s
    735  1.1  christos 	umlal	v21.2d,v10.2s,v1.2s
    736  1.1  christos 
    737  1.1  christos 	add	v13.2s,v13.2s,v28.2s
    738  1.1  christos 	umlal	v22.2d,v12.2s,v0.2s
    739  1.1  christos 	umlal	v19.2d,v12.2s,v4.2s
    740  1.1  christos 	umlal	v23.2d,v12.2s,v1.2s
    741  1.1  christos 	umlal	v20.2d,v12.2s,v6.2s
    742  1.1  christos 	umlal	v21.2d,v12.2s,v8.2s
    743  1.1  christos 
    744  1.1  christos 	umlal	v22.2d,v13.2s,v8.2s
    745  1.1  christos 	umlal	v19.2d,v13.2s,v2.2s
    746  1.1  christos 	umlal	v23.2d,v13.2s,v0.2s
    747  1.1  christos 	umlal	v20.2d,v13.2s,v4.2s
    748  1.1  christos 	umlal	v21.2d,v13.2s,v6.2s
    749  1.1  christos 
    750  1.1  christos .Lshort_tail:
    751  1.1  christos 	////////////////////////////////////////////////////////////////
    752  1.1  christos 	// horizontal add
    753  1.1  christos 
    754  1.1  christos 	addp	v22.2d,v22.2d,v22.2d
    755  1.1  christos 	ldp	d8,d9,[sp,#16]		// meet ABI requirements
    756  1.1  christos 	addp	v19.2d,v19.2d,v19.2d
    757  1.1  christos 	ldp	d10,d11,[sp,#32]
    758  1.1  christos 	addp	v23.2d,v23.2d,v23.2d
    759  1.1  christos 	ldp	d12,d13,[sp,#48]
    760  1.1  christos 	addp	v20.2d,v20.2d,v20.2d
    761  1.1  christos 	ldp	d14,d15,[sp,#64]
    762  1.1  christos 	addp	v21.2d,v21.2d,v21.2d
    763  1.1  christos 
    764  1.1  christos 	////////////////////////////////////////////////////////////////
    765  1.1  christos 	// lazy reduction, but without narrowing
    766  1.1  christos 
    767  1.1  christos 	ushr	v29.2d,v22.2d,#26
    768  1.1  christos 	and	v22.16b,v22.16b,v31.16b
    769  1.1  christos 	ushr	v30.2d,v19.2d,#26
    770  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    771  1.1  christos 
    772  1.1  christos 	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
    773  1.1  christos 	add	v20.2d,v20.2d,v30.2d	// h0 -> h1
    774  1.1  christos 
    775  1.1  christos 	ushr	v29.2d,v23.2d,#26
    776  1.1  christos 	and	v23.16b,v23.16b,v31.16b
    777  1.1  christos 	ushr	v30.2d,v20.2d,#26
    778  1.1  christos 	and	v20.16b,v20.16b,v31.16b
    779  1.1  christos 	add	v21.2d,v21.2d,v30.2d	// h1 -> h2
    780  1.1  christos 
    781  1.1  christos 	add	v19.2d,v19.2d,v29.2d
    782  1.1  christos 	shl	v29.2d,v29.2d,#2
    783  1.1  christos 	ushr	v30.2d,v21.2d,#26
    784  1.1  christos 	and	v21.16b,v21.16b,v31.16b
    785  1.1  christos 	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
    786  1.1  christos 	add	v22.2d,v22.2d,v30.2d	// h2 -> h3
    787  1.1  christos 
    788  1.1  christos 	ushr	v29.2d,v19.2d,#26
    789  1.1  christos 	and	v19.16b,v19.16b,v31.16b
    790  1.1  christos 	ushr	v30.2d,v22.2d,#26
    791  1.1  christos 	and	v22.16b,v22.16b,v31.16b
    792  1.1  christos 	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
    793  1.1  christos 	add	v23.2d,v23.2d,v30.2d	// h3 -> h4
    794  1.1  christos 
    795  1.1  christos 	////////////////////////////////////////////////////////////////
    796  1.1  christos 	// write the result, can be partially reduced
    797  1.1  christos 
    798  1.1  christos 	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
    799  1.1  christos 	st1	{v23.s}[0],[x0]
    800  1.1  christos 
    801  1.1  christos .Lno_data_neon:
    802  1.1  christos 	ldr	x29,[sp],#80
    803  1.1  christos .inst	0xd50323bf		// autiasp
    804  1.1  christos 	ret
    805  1.1  christos .size	poly1305_blocks_neon,.-poly1305_blocks_neon
    806  1.1  christos 
    807  1.1  christos .type	poly1305_emit_neon,%function
    808  1.1  christos .align	5
                       ////////////////////////////////////////////////////////////////
                       // poly1305_emit_neon: finish the MAC when the hash may be held as
                       // five 32-bit limbs in base 2^26, and store the 16-byte result.
                       //
                       // In:   x0 = state; limbs h0..h4 are read from [x0]..[x0,#16] and
                       //            a 64-bit flag word from [x0,#24] (poly1305_init's
                       //            comment names the zeroed state word is_base2_26)
                       //       x1 = output buffer, 16 bytes are stored
                       //       x2 = nonce, 16 bytes are loaded
                       // Uses: x4-x6, x10-x14, x17 and the condition flags.
                       //       Does not touch the stack and makes no calls.
                       //
                       // When the flag word is zero, control is transferred to
                       // poly1305_emit with x0-x2 untouched; since cbz does not write
                       // x30, that routine returns directly to our caller.
    809  1.1  christos poly1305_emit_neon:
    810  1.1  christos 	ldr	x17,[x0,#24]
    811  1.1  christos 	cbz	x17,poly1305_emit
    812  1.1  christos 
    813  1.1  christos 	ldp	w10,w11,[x0]		// load hash value base 2^26
    814  1.1  christos 	ldp	w12,w13,[x0,#8]
    815  1.1  christos 	ldr	w14,[x0,#16]
    816  1.1  christos 
                       	// Assemble h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104 into
                       	// x4 (bits 0-63), x5 (bits 64-127) and x6 (bits 128 and up).
                       	// The w-register loads above zero-extend, so the 64-bit
                       	// shifts below see clean upper halves.
    817  1.1  christos 	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
    818  1.1  christos 	lsr	x5,x12,#12		// part of h2 at bit 64 and above
    819  1.1  christos 	adds	x4,x4,x12,lsl#52	// low 12 bits of h2 go to x4[63:52]
    820  1.1  christos 	add	x5,x5,x13,lsl#14	// h3 starts at bit 78 = 64+14; flags untouched
    821  1.1  christos 	adc	x5,x5,xzr		// carry out of x4 from the adds above
    822  1.1  christos 	lsr	x6,x14,#24		// part of h4 at bit 128 and above
    823  1.1  christos 	adds	x5,x5,x14,lsl#40	// h4 starts at bit 104 = 64+40
    824  1.1  christos 	adc	x6,x6,xzr		// can be partially reduced...
    825  1.1  christos 
    826  1.1  christos 	ldp	x10,x11,[x2]	// load nonce
    827  1.1  christos 
                       	// x12 = (x6 & ~3) + (x6 >> 2) = 5 * (x6 >> 2): everything at or
                       	// above 2^130 is folded back into the low end multiplied by 5
                       	// (2^130 is congruent to 5 modulo 2^130-5); x6 keeps bits 128-129.
    828  1.1  christos 	and	x12,x6,#-4		// ... so reduce
    829  1.1  christos 	add	x12,x12,x6,lsr#2
    830  1.1  christos 	and	x6,x6,#3
    831  1.1  christos 	adds	x4,x4,x12
    832  1.1  christos 	adcs	x5,x5,xzr
    833  1.1  christos 	adc	x6,x6,xzr
    834  1.1  christos 
                       	// x12:x13:x14 = h + 5.  If that sum reaches 2^130 (some bit of
                       	// x14 above bit 1 is set) then h >= 2^130-5 and the low 128 bits
                       	// of h + 5 are the reduced value; otherwise h itself is kept.
                       	// The choice is made with csel on the tst result, not a branch.
    835  1.1  christos 	adds	x12,x4,#5		// compare to modulus
    836  1.1  christos 	adcs	x13,x5,xzr
    837  1.1  christos 	adc	x14,x6,xzr
    838  1.1  christos 
    839  1.1  christos 	tst	x14,#-4			// see if it's carried/borrowed
    840  1.1  christos 
    841  1.1  christos 	csel	x4,x4,x12,eq
    842  1.1  christos 	csel	x5,x5,x13,eq
    843  1.1  christos 
    844  1.1  christos #ifdef	__ARMEB__
    845  1.1  christos 	ror	x10,x10,#32		// flip nonce words
    846  1.1  christos 	ror	x11,x11,#32
    847  1.1  christos #endif
                       	// 128-bit addition: the carry out of x5 is discarded.
    848  1.1  christos 	adds	x4,x4,x10		// accumulate nonce
    849  1.1  christos 	adc	x5,x5,x11
    850  1.1  christos #ifdef	__ARMEB__
    851  1.1  christos 	rev	x4,x4			// flip output bytes
    852  1.1  christos 	rev	x5,x5
    853  1.1  christos #endif
    854  1.1  christos 	stp	x4,x5,[x1]		// write result
    855  1.1  christos 
    856  1.1  christos 	ret
    857  1.1  christos .size	poly1305_emit_neon,.-poly1305_emit_neon
    858  1.1  christos 
                       ////////////////////////////////////////////////////////////////
                       // Constant data placed after the code.
                       //
                       // .Lzeros: eight 32-bit zero words (32 bytes); the .align 5 in
                       // front of it requests 2^5 = 32-byte alignment.
    859  1.1  christos .align	5
    860  1.1  christos .Lzeros:
    861  1.1  christos .long	0,0,0,0,0,0,0,0
                       // .LOPENSSL_armcap_P holds the distance from this word to the
                       // symbol OPENSSL_armcap_P ("symbol - ."), not an absolute address.
                       // poly1305_init loads the word, takes the address of this label
                       // with adr, and reads the capability word at label + offset.
                       // The word is 32 bits wide under __ILP32__ (poly1305_init reads it
                       // with ldrsw there) and 64 bits wide otherwise.
    862  1.1  christos .LOPENSSL_armcap_P:
    863  1.1  christos #ifdef	__ILP32__
    864  1.1  christos .long	OPENSSL_armcap_P-.
    865  1.1  christos #else
    866  1.1  christos .quad	OPENSSL_armcap_P-.
    867  1.1  christos #endif
                       // NUL-terminated ASCII identification string:
                       // "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
    868  1.1  christos .byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
                       // Pad to a 4-byte boundary after the odd-length string; the second
                       // .align 2 is redundant once the first has been applied.
    869  1.1  christos .align	2
    870  1.1  christos .align	2
    871