Home | History | Annotate | Line # | Download | only in aarch64
      1  1.2  christos #include "arm_arch.h"
      2  1.1  christos #ifndef	__KERNEL__
      3  1.1  christos 
      4  1.1  christos .hidden	OPENSSL_armv8_rsa_neonized
      5  1.1  christos #endif
      6  1.1  christos .text
      7  1.1  christos 
      8  1.1  christos .globl	bn_mul_mont
      9  1.1  christos .type	bn_mul_mont,%function
     10  1.1  christos .align	5
                       // bn_mul_mont -- Montgomery multiplication: entry-point dispatcher plus
                       // the generic word-by-word implementation (.Lmul_mont).
                       //
                       // Register arguments as they are used below (names follow the inline
                       // comments):
                       //   x0 = rp    result vector, num 64-bit words are written
                       //   x1 = ap    first operand
                       //   x2 = bp    second operand
                       //   x3 = np    modulus
                       //   x4 = n0    pointer; only the 64-bit word at [x4] is read here
                       //   x5 = num   length in 64-bit words
                       // Returns x0 = 1 on the generic path.
                       // NOTE(review): presumably this is the C-callable bn_mul_mont() of the
                       // bignum library -- confirm the prototype against the C headers.
                       //
                       // Dispatch on x5, in the order tested:
                       //   (x5 & 3) != 0                  -> .Lmul_mont (generic path below)
                       //   x5 > 32, neither __KERNEL__ nor __AARCH64EB__ defined, and
                       //   OPENSSL_armv8_rsa_neonized != 0 -> bn_mul8x_mont_neon
                       //   (x5 & 7) == 0                  -> __bn_sqr8x_mont (not in this view)
                       //   otherwise                      -> __bn_mul4x_mont (not in this view)
                       // Every x5 with either low bit set has already left at the first
                       // test, so the last b.eq is always taken when reached and .Lmul_mont
                       // is entered only from that first b.ne.
                       //
                       // Generic path, in outline:
                       //   1. Push a 64-byte frame (x29, x30, x19-x24) and carve a 16-byte
                       //      aligned scratch vector tp[num] below it by moving sp.
                       //   2. .L1st: first pass using bp[0], writing tp[].
                       //   3. .Louter/.Linner: one pass per remaining bp[i], updating tp[].
                       //   4. .Lsub: rp = tp - np, tracking the borrow.
                       //   5. .Lcond_copy: if the subtraction borrowed, overwrite rp with
                       //      tp; tp is zeroed as it is read.
                       // The path loads ap[0..1] and np[0..1] unconditionally, so it assumes
                       // num >= 2 -- TODO confirm that callers guarantee this.
     11  1.1  christos bn_mul_mont:
     12  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
     13  1.1  christos .Lbn_mul_mont:
     14  1.1  christos 	tst	x5,#3			// num a multiple of 4?
     15  1.1  christos 	b.ne	.Lmul_mont		// no: generic word-by-word path
     16  1.1  christos 	cmp	x5,#32
     17  1.1  christos 	b.le	.Lscalar_impl		// num <= 32: skip the NEON check
     18  1.1  christos #ifndef	__KERNEL__
     19  1.1  christos #ifndef	__AARCH64EB__
     20  1.1  christos 	adrp	x17,OPENSSL_armv8_rsa_neonized
     21  1.1  christos 	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
     22  1.1  christos 	cbnz	w17, bn_mul8x_mont_neon	// flag non-zero: take the NEON routine
     23  1.1  christos #endif
     24  1.1  christos #endif
     25  1.1  christos 
     26  1.1  christos .Lscalar_impl:
     27  1.1  christos 	tst	x5,#7
     28  1.1  christos 	b.eq	__bn_sqr8x_mont		// num is a multiple of 8
     29  1.1  christos 	tst	x5,#3
     30  1.1  christos 	b.eq	__bn_mul4x_mont		// num is a multiple of 4
     31  1.1  christos 
     32  1.1  christos .Lmul_mont:
                       	// Frame: [x29,x30] [x19,x20] [x21,x22] [x23,x24], x29 = frame base.
     33  1.1  christos 	stp	x29,x30,[sp,#-64]!
     34  1.1  christos 	add	x29,sp,#0
     35  1.1  christos 	stp	x19,x20,[sp,#16]
     36  1.1  christos 	stp	x21,x22,[sp,#32]
     37  1.1  christos 	stp	x23,x24,[sp,#48]
     38  1.1  christos 
     39  1.1  christos 	ldr	x9,[x2],#8		// bp[0]
     40  1.1  christos 	sub	x22,sp,x5,lsl#3		// x22 = sp - num*8: room for tp[num]
     41  1.1  christos 	ldp	x7,x8,[x1],#16	// ap[0..1]
     42  1.1  christos 	lsl	x5,x5,#3		// x5 is a byte count from here on
     43  1.1  christos 	ldr	x4,[x4]		// *n0
     44  1.1  christos 	and	x22,x22,#-16		// ABI says so
     45  1.1  christos 	ldp	x13,x14,[x3],#16	// np[0..1]
     46  1.1  christos 
     47  1.1  christos 	mul	x6,x7,x9		// ap[0]*bp[0]
     48  1.1  christos 	sub	x21,x5,#16		// j=num-2
     49  1.1  christos 	umulh	x7,x7,x9
     50  1.1  christos 	mul	x10,x8,x9		// ap[1]*bp[0]
     51  1.1  christos 	umulh	x11,x8,x9
     52  1.1  christos 
     53  1.1  christos 	mul	x15,x6,x4		// "tp[0]"*n0
     54  1.1  christos 	mov	sp,x22			// alloca
     55  1.1  christos 
     56  1.1  christos 	// (*)	mul	x12,x13,x15	// np[0]*m1
     57  1.1  christos 	umulh	x13,x13,x15
     58  1.1  christos 	mul	x16,x14,x15		// np[1]*m1
     59  1.1  christos 	// (*)	adds	x12,x12,x6	// discarded
     60  1.1  christos 	// (*)	As for removal of first multiplication and addition
     61  1.1  christos 	//	instructions. The outcome of first addition is
     62  1.1  christos 	//	guaranteed to be zero, which leaves two computationally
     63  1.1  christos 	//	significant outcomes: it either carries or not. Then
     64  1.1  christos 	//	question is when does it carry? Is there alternative
     65  1.1  christos 	//	way to deduce it? If you follow operations, you can
     66  1.1  christos 	//	observe that condition for carry is quite simple:
     67  1.1  christos 	//	x6 being non-zero. So that carry can be calculated
     68  1.1  christos 	//	by adding -1 to x6. That's what next instruction does.
     69  1.1  christos 	subs	xzr,x6,#1		// (*)
     70  1.1  christos 	umulh	x17,x14,x15
     71  1.1  christos 	adc	x13,x13,xzr		// fold the deduced carry into the high word
     72  1.1  christos 	cbz	x21,.L1st_skip		// num == 2: no middle words to process
     73  1.1  christos 
                       	// First pass (bp[0]). Per iteration: x10/x11 = lo/hi of ap[j]*bp[0],
                       	// x16/x17 = lo/hi of np[j]*m1 (m1 in x15), x7 and x13 carry the
                       	// previous high halves, and the summed word is stored one slot
                       	// down, at tp[j-1].
     74  1.1  christos .L1st:
     75  1.1  christos 	ldr	x8,[x1],#8
     76  1.1  christos 	adds	x6,x10,x7
     77  1.1  christos 	sub	x21,x21,#8		// j--
     78  1.1  christos 	adc	x7,x11,xzr
     79  1.1  christos 
     80  1.1  christos 	ldr	x14,[x3],#8
     81  1.1  christos 	adds	x12,x16,x13
     82  1.1  christos 	mul	x10,x8,x9		// ap[j]*bp[0]
     83  1.1  christos 	adc	x13,x17,xzr
     84  1.1  christos 	umulh	x11,x8,x9
     85  1.1  christos 
     86  1.1  christos 	adds	x12,x12,x6
     87  1.1  christos 	mul	x16,x14,x15		// np[j]*m1
     88  1.1  christos 	adc	x13,x13,xzr
     89  1.1  christos 	umulh	x17,x14,x15
     90  1.1  christos 	str	x12,[x22],#8		// tp[j-1]
     91  1.1  christos 	cbnz	x21,.L1st
     92  1.1  christos 
                       	// Tail of the first pass: last product word plus the two high
                       	// halves give tp[num-2] and tp[num-1]; x19 keeps the bit that
                       	// does not fit.
     93  1.1  christos .L1st_skip:
     94  1.1  christos 	adds	x6,x10,x7
     95  1.1  christos 	sub	x1,x1,x5		// rewind x1
     96  1.1  christos 	adc	x7,x11,xzr
     97  1.1  christos 
     98  1.1  christos 	adds	x12,x16,x13
     99  1.1  christos 	sub	x3,x3,x5		// rewind x3
    100  1.1  christos 	adc	x13,x17,xzr
    101  1.1  christos 
    102  1.1  christos 	adds	x12,x12,x6
    103  1.1  christos 	sub	x20,x5,#8		// i=num-1
    104  1.1  christos 	adcs	x13,x13,x7
    105  1.1  christos 
    106  1.1  christos 	adc	x19,xzr,xzr		// upmost overflow bit
    107  1.1  christos 	stp	x12,x13,[x22]
    108  1.1  christos 
                       	// One pass per remaining bp[i]. Same structure as the first pass,
                       	// except that the existing tp[] words (read through x23) are added
                       	// in and the previous overflow bit x19 is folded into the top word.
    109  1.1  christos .Louter:
    110  1.1  christos 	ldr	x9,[x2],#8		// bp[i]
    111  1.1  christos 	ldp	x7,x8,[x1],#16
    112  1.1  christos 	ldr	x23,[sp]		// tp[0]
    113  1.1  christos 	add	x22,sp,#8
    114  1.1  christos 
    115  1.1  christos 	mul	x6,x7,x9		// ap[0]*bp[i]
    116  1.1  christos 	sub	x21,x5,#16		// j=num-2
    117  1.1  christos 	umulh	x7,x7,x9
    118  1.1  christos 	ldp	x13,x14,[x3],#16
    119  1.1  christos 	mul	x10,x8,x9		// ap[1]*bp[i]
    120  1.1  christos 	adds	x6,x6,x23
    121  1.1  christos 	umulh	x11,x8,x9
    122  1.1  christos 	adc	x7,x7,xzr
    123  1.1  christos 
    124  1.1  christos 	mul	x15,x6,x4		// m1 = (ap[0]*bp[i]+tp[0])*n0, low 64 bits
    125  1.1  christos 	sub	x20,x20,#8		// i--
    126  1.1  christos 
    127  1.1  christos 	// (*)	mul	x12,x13,x15	// np[0]*m1
    128  1.1  christos 	umulh	x13,x13,x15
    129  1.1  christos 	mul	x16,x14,x15		// np[1]*m1
    130  1.1  christos 	// (*)	adds	x12,x12,x6
    131  1.1  christos 	subs	xzr,x6,#1		// (*)
    132  1.1  christos 	umulh	x17,x14,x15
    133  1.1  christos 	cbz	x21,.Linner_skip
    134  1.1  christos 
                       	// The adc at the top of the loop consumes a carry produced earlier:
                       	// on the first iteration the one from the subs above, on later
                       	// iterations the one from "adds x12,x12,x6" at the loop bottom.
                       	// Nothing in between writes the flags.
    135  1.1  christos .Linner:
    136  1.1  christos 	ldr	x8,[x1],#8
    137  1.1  christos 	adc	x13,x13,xzr
    138  1.1  christos 	ldr	x23,[x22],#8		// tp[j]
    139  1.1  christos 	adds	x6,x10,x7
    140  1.1  christos 	sub	x21,x21,#8		// j--
    141  1.1  christos 	adc	x7,x11,xzr
    142  1.1  christos 
    143  1.1  christos 	adds	x12,x16,x13
    144  1.1  christos 	ldr	x14,[x3],#8
    145  1.1  christos 	adc	x13,x17,xzr
    146  1.1  christos 
    147  1.1  christos 	mul	x10,x8,x9		// ap[j]*bp[i]
    148  1.1  christos 	adds	x6,x6,x23
    149  1.1  christos 	umulh	x11,x8,x9
    150  1.1  christos 	adc	x7,x7,xzr
    151  1.1  christos 
    152  1.1  christos 	mul	x16,x14,x15		// np[j]*m1
    153  1.1  christos 	adds	x12,x12,x6
    154  1.1  christos 	umulh	x17,x14,x15
    155  1.1  christos 	stur	x12,[x22,#-16]		// tp[j-1]
    156  1.1  christos 	cbnz	x21,.Linner
    157  1.1  christos 
    158  1.1  christos .Linner_skip:
    159  1.1  christos 	ldr	x23,[x22],#8		// tp[j]
    160  1.1  christos 	adc	x13,x13,xzr		// pending carry (from subs or last adds)
    161  1.1  christos 	adds	x6,x10,x7
    162  1.1  christos 	sub	x1,x1,x5		// rewind x1
    163  1.1  christos 	adc	x7,x11,xzr
    164  1.1  christos 
    165  1.1  christos 	adds	x12,x16,x13
    166  1.1  christos 	sub	x3,x3,x5		// rewind x3
    167  1.1  christos 	adcs	x13,x17,x19		// add in the previous pass's overflow bit
    168  1.1  christos 	adc	x19,xzr,xzr
    169  1.1  christos 
    170  1.1  christos 	adds	x6,x6,x23
    171  1.1  christos 	adc	x7,x7,xzr
    172  1.1  christos 
    173  1.1  christos 	adds	x12,x12,x6
    174  1.1  christos 	adcs	x13,x13,x7
    175  1.1  christos 	adc	x19,x19,xzr		// upmost overflow bit
    176  1.1  christos 	stp	x12,x13,[x22,#-16]	// tp[num-2], tp[num-1]
    177  1.1  christos 
    178  1.1  christos 	cbnz	x20,.Louter
    179  1.1  christos 
    180  1.1  christos 	// Final step. We see if result is larger than modulus, and
    181  1.1  christos 	// if it is, subtract the modulus. But comparison implies
    182  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
    183  1.1  christos 	// and conditionally copy original value.
    184  1.1  christos 	ldr	x23,[sp]		// tp[0]
    185  1.1  christos 	add	x22,sp,#8
    186  1.1  christos 	ldr	x14,[x3],#8		// np[0]
    187  1.1  christos 	subs	x21,x5,#8		// j=num-1 and clear borrow
    188  1.1  christos 	mov	x1,x0			// x1 walks rp; x0 is kept for the copy loop
    189  1.1  christos .Lsub:
    190  1.1  christos 	sbcs	x8,x23,x14		// tp[j]-np[j]
    191  1.1  christos 	ldr	x23,[x22],#8
    192  1.1  christos 	sub	x21,x21,#8		// j--
    193  1.1  christos 	ldr	x14,[x3],#8
    194  1.1  christos 	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
    195  1.1  christos 	cbnz	x21,.Lsub
    196  1.1  christos 
    197  1.1  christos 	sbcs	x8,x23,x14
    198  1.1  christos 	sbcs	x19,x19,xzr		// did it borrow?
    199  1.1  christos 	str	x8,[x1],#8		// rp[num-1]
    200  1.1  christos 
                       	// The flags set by the last sbcs stay live through the whole copy
                       	// loop (only non-flag-setting instructions follow). "lo" (carry
                       	// clear) means the subtraction borrowed, so csel keeps tp[j];
                       	// otherwise it keeps the difference already stored in rp[j].
    201  1.1  christos 	ldr	x23,[sp]		// tp[0]
    202  1.1  christos 	add	x22,sp,#8
    203  1.1  christos 	ldr	x8,[x0],#8		// rp[0]
    204  1.1  christos 	sub	x5,x5,#8		// num--
    205  1.1  christos 	nop
    206  1.1  christos .Lcond_copy:
    207  1.1  christos 	sub	x5,x5,#8		// num--
    208  1.1  christos 	csel	x14,x23,x8,lo		// did it borrow?
    209  1.1  christos 	ldr	x23,[x22],#8
    210  1.1  christos 	ldr	x8,[x0],#8
    211  1.1  christos 	stur	xzr,[x22,#-16]		// wipe tp
    212  1.1  christos 	stur	x14,[x0,#-16]
    213  1.1  christos 	cbnz	x5,.Lcond_copy
    214  1.1  christos 
    215  1.1  christos 	csel	x14,x23,x8,lo
    216  1.1  christos 	stur	xzr,[x22,#-8]		// wipe tp
    217  1.1  christos 	stur	x14,[x0,#-8]
    218  1.1  christos 
                       	// Epilogue: resetting sp to x29 releases tp[]. Only x29 is
                       	// reloaded from the frame; x30 is not written by any instruction
                       	// on this path after the stp that saved it.
    219  1.1  christos 	ldp	x19,x20,[x29,#16]
    220  1.1  christos 	mov	sp,x29
    221  1.1  christos 	ldp	x21,x22,[x29,#32]
    222  1.1  christos 	mov	x0,#1			// return value
    223  1.1  christos 	ldp	x23,x24,[x29,#48]
    224  1.1  christos 	ldr	x29,[sp],#64
    225  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
    226  1.1  christos 	ret
    227  1.1  christos .size	bn_mul_mont,.-bn_mul_mont
    228  1.1  christos .type	bn_mul8x_mont_neon,%function
    229  1.1  christos .align	5
    230  1.1  christos bn_mul8x_mont_neon:
    231  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
    232  1.2  christos 	// only from bn_mul_mont which has already signed the return address.
    233  1.1  christos 	stp	x29,x30,[sp,#-80]!
    234  1.1  christos 	mov	x16,sp
    235  1.1  christos 	stp	d8,d9,[sp,#16]
    236  1.1  christos 	stp	d10,d11,[sp,#32]
    237  1.1  christos 	stp	d12,d13,[sp,#48]
    238  1.1  christos 	stp	d14,d15,[sp,#64]
    239  1.1  christos 	lsl	x5,x5,#1
    240  1.1  christos 	eor	v14.16b,v14.16b,v14.16b
    241  1.1  christos 
    242  1.1  christos .align	4
    243  1.1  christos .LNEON_8n:
    244  1.1  christos 	eor	v6.16b,v6.16b,v6.16b
    245  1.1  christos 	sub	x7,sp,#128
    246  1.1  christos 	eor	v7.16b,v7.16b,v7.16b
    247  1.1  christos 	sub	x7,x7,x5,lsl#4
    248  1.1  christos 	eor	v8.16b,v8.16b,v8.16b
    249  1.1  christos 	and	x7,x7,#-64
    250  1.1  christos 	eor	v9.16b,v9.16b,v9.16b
    251  1.1  christos 	mov	sp,x7		// alloca
    252  1.1  christos 	eor	v10.16b,v10.16b,v10.16b
    253  1.1  christos 	add	x7,x7,#256
    254  1.1  christos 	eor	v11.16b,v11.16b,v11.16b
    255  1.1  christos 	sub	x8,x5,#8
    256  1.1  christos 	eor	v12.16b,v12.16b,v12.16b
    257  1.1  christos 	eor	v13.16b,v13.16b,v13.16b
    258  1.1  christos 
    259  1.1  christos .LNEON_8n_init:
    260  1.1  christos 	st1	{v6.2d,v7.2d},[x7],#32
    261  1.1  christos 	subs	x8,x8,#8
    262  1.1  christos 	st1	{v8.2d,v9.2d},[x7],#32
    263  1.1  christos 	st1	{v10.2d,v11.2d},[x7],#32
    264  1.1  christos 	st1	{v12.2d,v13.2d},[x7],#32
    265  1.1  christos 	bne	.LNEON_8n_init
    266  1.1  christos 
    267  1.1  christos 	add	x6,sp,#256
    268  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    269  1.1  christos 	add	x10,sp,#8
    270  1.1  christos 	ldr	s30,[x4],#4
    271  1.1  christos 	mov	x9,x5
    272  1.1  christos 	b	.LNEON_8n_outer
    273  1.1  christos 
    274  1.1  christos .align	4
    275  1.1  christos .LNEON_8n_outer:
    276  1.1  christos 	ldr	s28,[x2],#4   // *b++
    277  1.1  christos 	uxtl	v28.4s,v28.4h
    278  1.1  christos 	add	x7,sp,#128
    279  1.1  christos 	ld1	{v2.4s,v3.4s},[x3],#32
    280  1.1  christos 
    281  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[0]
    282  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[1]
    283  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[2]
    284  1.1  christos 	shl	v29.2d,v6.2d,#16
    285  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    286  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[3]
    287  1.1  christos 	add	v29.2d,v29.2d,v6.2d
    288  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[0]
    289  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    290  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[1]
    291  1.1  christos 	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
    292  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[2]
    293  1.1  christos 	uxtl	v29.4s,v29.4h
    294  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[3]
    295  1.1  christos 	ldr	s28,[x2],#4   // *b++
    296  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[0]
    297  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[1]
    298  1.1  christos 	uxtl	v28.4s,v28.4h
    299  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[2]
    300  1.1  christos 	ushr	v15.2d,v6.2d,#16
    301  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[3]
    302  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[0]
    303  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8
    304  1.1  christos 	add	v6.2d,v6.2d,v15.2d
    305  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[1]
    306  1.1  christos 	ushr	v6.2d,v6.2d,#16
    307  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[2]
    308  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[3]
    309  1.1  christos 	add	v16.2d,v7.2d,v6.2d
    310  1.1  christos 	ins	v7.d[0],v16.d[0]
    311  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
    312  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[0]
    313  1.1  christos 	ld1	{v6.2d},[x6],#16
    314  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[1]
    315  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[2]
    316  1.1  christos 	shl	v29.2d,v7.2d,#16
    317  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    318  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[3]
    319  1.1  christos 	add	v29.2d,v29.2d,v7.2d
    320  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[0]
    321  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    322  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[1]
    323  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
    324  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[2]
    325  1.1  christos 	uxtl	v29.4s,v29.4h
    326  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[3]
    327  1.1  christos 	ldr	s28,[x2],#4   // *b++
    328  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[0]
    329  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[1]
    330  1.1  christos 	uxtl	v28.4s,v28.4h
    331  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[2]
    332  1.1  christos 	ushr	v15.2d,v7.2d,#16
    333  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[3]
    334  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[0]
    335  1.1  christos 	ext	v7.16b,v7.16b,v7.16b,#8
    336  1.1  christos 	add	v7.2d,v7.2d,v15.2d
    337  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[1]
    338  1.1  christos 	ushr	v7.2d,v7.2d,#16
    339  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[2]
    340  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[3]
    341  1.1  christos 	add	v16.2d,v8.2d,v7.2d
    342  1.1  christos 	ins	v8.d[0],v16.d[0]
    343  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
    344  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[0]
    345  1.1  christos 	ld1	{v7.2d},[x6],#16
    346  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[1]
    347  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[2]
    348  1.1  christos 	shl	v29.2d,v8.2d,#16
    349  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    350  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[3]
    351  1.1  christos 	add	v29.2d,v29.2d,v8.2d
    352  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[0]
    353  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    354  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[1]
    355  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
    356  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[2]
    357  1.1  christos 	uxtl	v29.4s,v29.4h
    358  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[3]
    359  1.1  christos 	ldr	s28,[x2],#4   // *b++
    360  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[0]
    361  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[1]
    362  1.1  christos 	uxtl	v28.4s,v28.4h
    363  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[2]
    364  1.1  christos 	ushr	v15.2d,v8.2d,#16
    365  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[3]
    366  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[0]
    367  1.1  christos 	ext	v8.16b,v8.16b,v8.16b,#8
    368  1.1  christos 	add	v8.2d,v8.2d,v15.2d
    369  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[1]
    370  1.1  christos 	ushr	v8.2d,v8.2d,#16
    371  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[2]
    372  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[3]
    373  1.1  christos 	add	v16.2d,v9.2d,v8.2d
    374  1.1  christos 	ins	v9.d[0],v16.d[0]
    375  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
    376  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[0]
    377  1.1  christos 	ld1	{v8.2d},[x6],#16
    378  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[1]
    379  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[2]
    380  1.1  christos 	shl	v29.2d,v9.2d,#16
    381  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    382  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[3]
    383  1.1  christos 	add	v29.2d,v29.2d,v9.2d
    384  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[0]
    385  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    386  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[1]
    387  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
    388  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[2]
    389  1.1  christos 	uxtl	v29.4s,v29.4h
    390  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[3]
    391  1.1  christos 	ldr	s28,[x2],#4   // *b++
    392  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[0]
    393  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[1]
    394  1.1  christos 	uxtl	v28.4s,v28.4h
    395  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[2]
    396  1.1  christos 	ushr	v15.2d,v9.2d,#16
    397  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[3]
    398  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[0]
    399  1.1  christos 	ext	v9.16b,v9.16b,v9.16b,#8
    400  1.1  christos 	add	v9.2d,v9.2d,v15.2d
    401  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[1]
    402  1.1  christos 	ushr	v9.2d,v9.2d,#16
    403  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[2]
    404  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[3]
    405  1.1  christos 	add	v16.2d,v10.2d,v9.2d
    406  1.1  christos 	ins	v10.d[0],v16.d[0]
    407  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
    408  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[0]
    409  1.1  christos 	ld1	{v9.2d},[x6],#16
    410  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[1]
    411  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[2]
    412  1.1  christos 	shl	v29.2d,v10.2d,#16
    413  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    414  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[3]
    415  1.1  christos 	add	v29.2d,v29.2d,v10.2d
    416  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[0]
    417  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    418  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[1]
    419  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
    420  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[2]
    421  1.1  christos 	uxtl	v29.4s,v29.4h
    422  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[3]
    423  1.1  christos 	ldr	s28,[x2],#4   // *b++
    424  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[0]
    425  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[1]
    426  1.1  christos 	uxtl	v28.4s,v28.4h
    427  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[2]
    428  1.1  christos 	ushr	v15.2d,v10.2d,#16
    429  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[3]
    430  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[0]
    431  1.1  christos 	ext	v10.16b,v10.16b,v10.16b,#8
    432  1.1  christos 	add	v10.2d,v10.2d,v15.2d
    433  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[1]
    434  1.1  christos 	ushr	v10.2d,v10.2d,#16
    435  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[2]
    436  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[3]
    437  1.1  christos 	add	v16.2d,v11.2d,v10.2d
    438  1.1  christos 	ins	v11.d[0],v16.d[0]
    439  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
    440  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[0]
    441  1.1  christos 	ld1	{v10.2d},[x6],#16
    442  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[1]
    443  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[2]
    444  1.1  christos 	shl	v29.2d,v11.2d,#16
    445  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    446  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[3]
    447  1.1  christos 	add	v29.2d,v29.2d,v11.2d
    448  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[0]
    449  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    450  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[1]
    451  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
    452  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[2]
    453  1.1  christos 	uxtl	v29.4s,v29.4h
    454  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[3]
    455  1.1  christos 	ldr	s28,[x2],#4   // *b++
    456  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[0]
    457  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[1]
    458  1.1  christos 	uxtl	v28.4s,v28.4h
    459  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[2]
    460  1.1  christos 	ushr	v15.2d,v11.2d,#16
    461  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[3]
    462  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[0]
    463  1.1  christos 	ext	v11.16b,v11.16b,v11.16b,#8
    464  1.1  christos 	add	v11.2d,v11.2d,v15.2d
    465  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[1]
    466  1.1  christos 	ushr	v11.2d,v11.2d,#16
    467  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[2]
    468  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[3]
    469  1.1  christos 	add	v16.2d,v12.2d,v11.2d
    470  1.1  christos 	ins	v12.d[0],v16.d[0]
    471  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
    472  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[0]
    473  1.1  christos 	ld1	{v11.2d},[x6],#16
    474  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[1]
    475  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[2]
    476  1.1  christos 	shl	v29.2d,v12.2d,#16
    477  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    478  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[3]
    479  1.1  christos 	add	v29.2d,v29.2d,v12.2d
    480  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[0]
    481  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    482  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[1]
    483  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
    484  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[2]
    485  1.1  christos 	uxtl	v29.4s,v29.4h
    486  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[3]
    487  1.1  christos 	ldr	s28,[x2],#4   // *b++
    488  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[0]
    489  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[1]
    490  1.1  christos 	uxtl	v28.4s,v28.4h
    491  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[2]
    492  1.1  christos 	ushr	v15.2d,v12.2d,#16
    493  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[3]
    494  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[0]
    495  1.1  christos 	ext	v12.16b,v12.16b,v12.16b,#8
    496  1.1  christos 	add	v12.2d,v12.2d,v15.2d
    497  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[1]
    498  1.1  christos 	ushr	v12.2d,v12.2d,#16
    499  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[2]
    500  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[3]
    501  1.1  christos 	add	v16.2d,v13.2d,v12.2d
    502  1.1  christos 	ins	v13.d[0],v16.d[0]
    503  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
    504  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[0]
    505  1.1  christos 	ld1	{v12.2d},[x6],#16
    506  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[1]
    507  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[2]
    508  1.1  christos 	shl	v29.2d,v13.2d,#16
    509  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    510  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[3]
    511  1.1  christos 	add	v29.2d,v29.2d,v13.2d
    512  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[0]
    513  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    514  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[1]
    515  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
    516  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[2]
    517  1.1  christos 	uxtl	v29.4s,v29.4h
    518  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[3]
    519  1.1  christos 	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
    520  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[0]
    521  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    522  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[1]
    523  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[2]
    524  1.1  christos 	mov	v5.16b,v13.16b
    525  1.1  christos 	ushr	v5.2d,v5.2d,#16
    526  1.1  christos 	ext	v13.16b,v13.16b,v13.16b,#8
    527  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[3]
    528  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[0]
    529  1.1  christos 	add	v13.2d,v13.2d,v5.2d
    530  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[1]
    531  1.1  christos 	ushr	v13.2d,v13.2d,#16
    532  1.1  christos 	eor	v15.16b,v15.16b,v15.16b
    533  1.1  christos 	ins	v13.d[1],v15.d[0]
    534  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[2]
    535  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[3]
    536  1.1  christos 	add	v6.2d,v6.2d,v13.2d
    537  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
    538  1.1  christos 	add	x10,sp,#8		// rewind
    539  1.1  christos 	sub	x8,x5,#8
    540  1.1  christos 	b	.LNEON_8n_inner
    541  1.1  christos 
    542  1.1  christos .align	4
    543  1.1  christos .LNEON_8n_inner:
    544  1.1  christos 	subs	x8,x8,#8
    545  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[0]
    546  1.1  christos 	ld1	{v13.2d},[x6]
    547  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[1]
    548  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
    549  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[2]
    550  1.1  christos 	ld1	{v2.4s,v3.4s},[x3],#32
    551  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[3]
    552  1.1  christos 	b.eq	.LInner_jump
    553  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    554  1.1  christos .LInner_jump:
    555  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[0]
    556  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[1]
    557  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[2]
    558  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[3]
    559  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
    560  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[0]
    561  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[1]
    562  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[2]
    563  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[3]
    564  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[0]
    565  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[1]
    566  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[2]
    567  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[3]
    568  1.1  christos 	st1	{v6.2d},[x7],#16
    569  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[0]
    570  1.1  christos 	ld1	{v6.2d},[x6]
    571  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[1]
    572  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
    573  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[2]
    574  1.1  christos 	b.eq	.LInner_jump1
    575  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    576  1.1  christos .LInner_jump1:
    577  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[3]
    578  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[0]
    579  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[1]
    580  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[2]
    581  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[3]
    582  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
    583  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[0]
    584  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[1]
    585  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[2]
    586  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[3]
    587  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[0]
    588  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[1]
    589  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[2]
    590  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[3]
    591  1.1  christos 	st1	{v7.2d},[x7],#16
    592  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[0]
    593  1.1  christos 	ld1	{v7.2d},[x6]
    594  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[1]
    595  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
    596  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[2]
    597  1.1  christos 	b.eq	.LInner_jump2
    598  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    599  1.1  christos .LInner_jump2:
    600  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[3]
    601  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[0]
    602  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[1]
    603  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[2]
    604  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[3]
    605  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
    606  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[0]
    607  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[1]
    608  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[2]
    609  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[3]
    610  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[0]
    611  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[1]
    612  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[2]
    613  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[3]
    614  1.1  christos 	st1	{v8.2d},[x7],#16
    615  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[0]
    616  1.1  christos 	ld1	{v8.2d},[x6]
    617  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[1]
    618  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
    619  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[2]
    620  1.1  christos 	b.eq	.LInner_jump3
    621  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    622  1.1  christos .LInner_jump3:
    623  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[3]
    624  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[0]
    625  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[1]
    626  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[2]
    627  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[3]
    628  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
    629  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[0]
    630  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[1]
    631  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[2]
    632  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[3]
    633  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[0]
    634  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[1]
    635  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[2]
    636  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[3]
    637  1.1  christos 	st1	{v9.2d},[x7],#16
    638  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[0]
    639  1.1  christos 	ld1	{v9.2d},[x6]
    640  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[1]
    641  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
    642  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[2]
    643  1.1  christos 	b.eq	.LInner_jump4
    644  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    645  1.1  christos .LInner_jump4:
    646  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[3]
    647  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[0]
    648  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[1]
    649  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[2]
    650  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[3]
    651  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
    652  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[0]
    653  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[1]
    654  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[2]
    655  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[3]
    656  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[0]
    657  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[1]
    658  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[2]
    659  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[3]
    660  1.1  christos 	st1	{v10.2d},[x7],#16
    661  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[0]
    662  1.1  christos 	ld1	{v10.2d},[x6]
    663  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[1]
    664  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
    665  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[2]
    666  1.1  christos 	b.eq	.LInner_jump5
    667  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    668  1.1  christos .LInner_jump5:
    669  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[3]
    670  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[0]
    671  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[1]
    672  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[2]
    673  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[3]
    674  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
    675  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[0]
    676  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[1]
    677  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[2]
    678  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[3]
    679  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[0]
    680  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[1]
    681  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[2]
    682  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[3]
    683  1.1  christos 	st1	{v11.2d},[x7],#16
    684  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[0]
    685  1.1  christos 	ld1	{v11.2d},[x6]
    686  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[1]
    687  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
    688  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[2]
    689  1.1  christos 	b.eq	.LInner_jump6
    690  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    691  1.1  christos .LInner_jump6:
    692  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[3]
    693  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[0]
    694  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[1]
    695  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[2]
    696  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[3]
    697  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
    698  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[0]
    699  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[1]
    700  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[2]
    701  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[3]
    702  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[0]
    703  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[1]
    704  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[2]
    705  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[3]
    706  1.1  christos 	st1	{v12.2d},[x7],#16
    707  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[0]
    708  1.1  christos 	ld1	{v12.2d},[x6]
    709  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[1]
    710  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
    711  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[2]
    712  1.1  christos 	b.eq	.LInner_jump7
    713  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    714  1.1  christos .LInner_jump7:
    715  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[3]
    716  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[0]
    717  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[1]
    718  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[2]
    719  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[3]
    720  1.1  christos 	b.ne	.LInner_after_rewind8
    721  1.1  christos 	sub	x1,x1,x5,lsl#2	// rewind
    722  1.1  christos .LInner_after_rewind8:
    723  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[0]
    724  1.1  christos 	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
    725  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[1]
    726  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    727  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[2]
    728  1.1  christos 	add	x10,sp,#8		// rewind
    729  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[3]
    730  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[0]
    731  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[1]
    732  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[2]
    733  1.1  christos 	st1	{v13.2d},[x7],#16
    734  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[3]
    735  1.1  christos 
    736  1.1  christos 	bne	.LNEON_8n_inner
    737  1.1  christos 	add	x6,sp,#128
    738  1.1  christos 	st1	{v6.2d,v7.2d},[x7],#32
    739  1.1  christos 	eor	v2.16b,v2.16b,v2.16b	// v2
    740  1.1  christos 	st1	{v8.2d,v9.2d},[x7],#32
    741  1.1  christos 	eor	v3.16b,v3.16b,v3.16b	// v3
    742  1.1  christos 	st1	{v10.2d,v11.2d},[x7],#32
    743  1.1  christos 	st1	{v12.2d},[x7]
    744  1.1  christos 
    745  1.1  christos 	subs	x9,x9,#8
    746  1.1  christos 	ld1	{v6.2d,v7.2d},[x6],#32
    747  1.1  christos 	ld1	{v8.2d,v9.2d},[x6],#32
    748  1.1  christos 	ld1	{v10.2d,v11.2d},[x6],#32
    749  1.1  christos 	ld1	{v12.2d,v13.2d},[x6],#32
    750  1.1  christos 
    751  1.1  christos 	b.eq	.LInner_8n_jump_2steps
    752  1.1  christos 	sub	x3,x3,x5,lsl#2	// rewind
    753  1.1  christos 	b	.LNEON_8n_outer
    754  1.1  christos 
     755  1.1  christos .LInner_8n_jump_2steps:	// taken when x9 reaches zero: wipe 128 bytes, normalize v6, enter the tail loop
     756  1.1  christos 	add	x7,sp,#128		// x7 = word output pointer; equals sp once the four wiping stores have advanced it
     757  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame (v2/v3 were zeroed with eor before the x9 test)
     758  1.1  christos 	mov	v5.16b,v6.16b		// v5 = untouched copy of v6, supplies the low 16 bits to zip1
     759  1.1  christos 	ushr	v15.2d,v6.2d,#16	// per-lane part above bit 15
     760  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8	// swap the two 64-bit lanes of v6
     761  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32
     762  1.1  christos 	add	v6.2d,v6.2d,v15.2d	// lane 0 = old lane 1 + (old lane 0 >> 16)
     763  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32
     764  1.1  christos 	ushr	v15.2d,v6.2d,#16	// lane 0 = carry for the next accumulator (v7)
     765  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32	// 4*32 = 128 bytes zeroed, sp advanced by 128
     766  1.1  christos 	zip1	v6.4h,v5.4h,v6.4h	// v6.s[0] = v5.h[0] | v6.h[0]<<16, one finished 32-bit word
     767  1.1  christos 	ins	v15.d[1],v14.d[0]	// upper carry lane := v14.d[0] (presumably zero; v14 is set outside this view)
     768  1.1  christos 
     769  1.1  christos 	mov	x8,x5			// x8 = tail loop counter, decremented by 8 per pass
     770  1.1  christos 	b	.LNEON_tail_entry
    771  1.1  christos 
    772  1.1  christos .align	4
     773  1.1  christos .LNEON_tail:	// per pass: ripple 16-bit carries through v6..v13 and store eight 32-bit words at [x7]
     774  1.1  christos 	add	v6.2d,v6.2d,v15.2d	// carry-in left by v13 of the previous pass
     775  1.1  christos 	mov	v5.16b,v6.16b
     776  1.1  christos 	ushr	v15.2d,v6.2d,#16
     777  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8
     778  1.1  christos 	ld1	{v8.2d,v9.2d}, [x6],#32	// fetch the remaining six accumulators of this pass
     779  1.1  christos 	add	v6.2d,v6.2d,v15.2d
     780  1.1  christos 	ld1	{v10.2d,v11.2d}, [x6],#32
     781  1.1  christos 	ushr	v15.2d,v6.2d,#16
     782  1.1  christos 	ld1	{v12.2d,v13.2d}, [x6],#32
     783  1.1  christos 	zip1	v6.4h,v5.4h,v6.4h
     784  1.1  christos 	ins	v15.d[1],v14.d[0]
     785  1.1  christos 
     786  1.1  christos .LNEON_tail_entry:	// first pass enters here with v6 already normalized and v7..v13 preloaded
     787  1.1  christos 	add	v7.2d,v7.2d,v15.2d	// add carry out of v6
     788  1.1  christos 	st1	{v6.s}[0], [x7],#4	// emit word 0 of this pass
     789  1.1  christos 	ushr	v15.2d,v7.2d,#16
     790  1.1  christos 	mov	v5.16b,v7.16b
     791  1.1  christos 	ext	v7.16b,v7.16b,v7.16b,#8
     792  1.1  christos 	add	v7.2d,v7.2d,v15.2d
     793  1.1  christos 	ushr	v15.2d,v7.2d,#16
     794  1.1  christos 	zip1	v7.4h,v5.4h,v7.4h
     795  1.1  christos 	ins	v15.d[1],v14.d[0]
     796  1.1  christos 	add	v8.2d,v8.2d,v15.2d	// add carry out of v7
     797  1.1  christos 	st1	{v7.s}[0], [x7],#4	// emit word 1
     798  1.1  christos 	ushr	v15.2d,v8.2d,#16
     799  1.1  christos 	mov	v5.16b,v8.16b
     800  1.1  christos 	ext	v8.16b,v8.16b,v8.16b,#8
     801  1.1  christos 	add	v8.2d,v8.2d,v15.2d
     802  1.1  christos 	ushr	v15.2d,v8.2d,#16
     803  1.1  christos 	zip1	v8.4h,v5.4h,v8.4h
     804  1.1  christos 	ins	v15.d[1],v14.d[0]
     805  1.1  christos 	add	v9.2d,v9.2d,v15.2d	// add carry out of v8
     806  1.1  christos 	st1	{v8.s}[0], [x7],#4	// emit word 2
     807  1.1  christos 	ushr	v15.2d,v9.2d,#16
     808  1.1  christos 	mov	v5.16b,v9.16b
     809  1.1  christos 	ext	v9.16b,v9.16b,v9.16b,#8
     810  1.1  christos 	add	v9.2d,v9.2d,v15.2d
     811  1.1  christos 	ushr	v15.2d,v9.2d,#16
     812  1.1  christos 	zip1	v9.4h,v5.4h,v9.4h
     813  1.1  christos 	ins	v15.d[1],v14.d[0]
     814  1.1  christos 	add	v10.2d,v10.2d,v15.2d	// add carry out of v9
     815  1.1  christos 	st1	{v9.s}[0], [x7],#4	// emit word 3
     816  1.1  christos 	ushr	v15.2d,v10.2d,#16
     817  1.1  christos 	mov	v5.16b,v10.16b
     818  1.1  christos 	ext	v10.16b,v10.16b,v10.16b,#8
     819  1.1  christos 	add	v10.2d,v10.2d,v15.2d
     820  1.1  christos 	ushr	v15.2d,v10.2d,#16
     821  1.1  christos 	zip1	v10.4h,v5.4h,v10.4h
     822  1.1  christos 	ins	v15.d[1],v14.d[0]
     823  1.1  christos 	add	v11.2d,v11.2d,v15.2d	// add carry out of v10
     824  1.1  christos 	st1	{v10.s}[0], [x7],#4	// emit word 4
     825  1.1  christos 	ushr	v15.2d,v11.2d,#16
     826  1.1  christos 	mov	v5.16b,v11.16b
     827  1.1  christos 	ext	v11.16b,v11.16b,v11.16b,#8
     828  1.1  christos 	add	v11.2d,v11.2d,v15.2d
     829  1.1  christos 	ushr	v15.2d,v11.2d,#16
     830  1.1  christos 	zip1	v11.4h,v5.4h,v11.4h
     831  1.1  christos 	ins	v15.d[1],v14.d[0]
     832  1.1  christos 	add	v12.2d,v12.2d,v15.2d	// add carry out of v11
     833  1.1  christos 	st1	{v11.s}[0], [x7],#4	// emit word 5
     834  1.1  christos 	ushr	v15.2d,v12.2d,#16
     835  1.1  christos 	mov	v5.16b,v12.16b
     836  1.1  christos 	ext	v12.16b,v12.16b,v12.16b,#8
     837  1.1  christos 	add	v12.2d,v12.2d,v15.2d
     838  1.1  christos 	ushr	v15.2d,v12.2d,#16
     839  1.1  christos 	zip1	v12.4h,v5.4h,v12.4h
     840  1.1  christos 	ins	v15.d[1],v14.d[0]
     841  1.1  christos 	add	v13.2d,v13.2d,v15.2d	// add carry out of v12
     842  1.1  christos 	st1	{v12.s}[0], [x7],#4	// emit word 6
     843  1.1  christos 	ushr	v15.2d,v13.2d,#16
     844  1.1  christos 	mov	v5.16b,v13.16b
     845  1.1  christos 	ext	v13.16b,v13.16b,v13.16b,#8
     846  1.1  christos 	add	v13.2d,v13.2d,v15.2d
     847  1.1  christos 	ushr	v15.2d,v13.2d,#16
     848  1.1  christos 	zip1	v13.4h,v5.4h,v13.4h
     849  1.1  christos 	ins	v15.d[1],v14.d[0]	// v15 now carries out of v13 into the next pass
     850  1.1  christos 	ld1	{v6.2d,v7.2d}, [x6],#32	// first two accumulators for the next pass
     851  1.1  christos 	subs	x8,x8,#8		// eight words handled in this pass
     852  1.1  christos 	st1	{v13.s}[0], [x7],#4	// emit word 7 (st1 leaves the flags from subs intact)
     853  1.1  christos 	bne	.LNEON_tail		// repeat until x8 reaches zero
    854  1.1  christos 
     855  1.1  christos 	st1	{v15.s}[0], [x7],#4	// top-most bit: final carry stored right after the last result word
     856  1.1  christos 	sub	x3,x3,x5,lsl#2		// rewind x3 by 4*x5 bytes (presumably back to np[0]; x3 is advanced outside this view)
     857  1.1  christos 	subs	x1,sp,#0			// x1 = sp; subtracting zero sets C=1, i.e. no borrow into the first sbcs
     858  1.1  christos 	add	x2,sp,x5,lsl#2		// x2 = sp + 4*x5: end of the result words, address of the top-most word
     859  1.1  christos 
     860  1.1  christos .LNEON_sub:	// [x0] = [x1] - [x3] with borrow, four 32-bit words per pass, until x1 reaches x2
     861  1.1  christos 	ldp	w4,w5,[x1],#8		// note: overwrites x5, which is why x2 was computed first
     862  1.1  christos 	ldp	w6,w7,[x1],#8
     863  1.1  christos 	ldp	w8,w9,[x3],#8
     864  1.1  christos 	ldp	w10,w11,[x3],#8
     865  1.1  christos 	sbcs	w8,w4,w8
     866  1.1  christos 	sbcs	w9,w5,w9
     867  1.1  christos 	sbcs	w10,w6,w10
     868  1.1  christos 	sbcs	w11,w7,w11
     869  1.1  christos 	sub	x17,x2,x1		// bytes left; plain sub does not touch the borrow in C
     870  1.1  christos 	stp	w8,w9,[x0],#8
     871  1.1  christos 	stp	w10,w11,[x0],#8
     872  1.1  christos 	cbnz	x17,.LNEON_sub		// cbnz also leaves the flags alone
    873  1.1  christos 
     874  1.1  christos 	ldr	w10, [x1]		// load top-most bit (x1 equals x2 here)
     875  1.1  christos 	mov	x11,sp
     876  1.1  christos 	eor	v0.16b,v0.16b,v0.16b	// v0 = 0, used for wiping
     877  1.1  christos 	sub	x11,x2,x11		// this is num*4
     878  1.1  christos 	eor	v1.16b,v1.16b,v1.16b	// v1 = 0, used for wiping
     879  1.1  christos 	mov	x1,sp			// x1 = start of the unreduced words again
     880  1.1  christos 	sub	x0,x0,x11		// rewind x0
     881  1.1  christos 	mov	x3,x2		// second 3/4th of frame: x3 wipes 96 bytes per pass, x1 wipes 32
     882  1.1  christos 	sbcs	w10,w10,wzr		// result is carry flag: C=1 if the whole subtraction did not borrow
     883  1.1  christos 
     884  1.1  christos .LNEON_copy_n_zap:	// per pass: settle eight words at [x0], then zero the stack words they came from
     885  1.1  christos 	ldp	w4,w5,[x1],#8		// w4..w7 = four unreduced words from the stack
     886  1.1  christos 	ldp	w6,w7,[x1],#8
     887  1.1  christos 	ldp	w8,w9,[x0],#8		// w8..w11 = the matching four words of the difference
     888  1.1  christos 	ldp	w10,w11,[x0]
     889  1.1  christos 	sub	x0,x0,#8		// x0 back to the first of these four words
     890  1.1  christos 	b.cs	.LCopy_1		// C=1: keep the difference; NOTE(review): branch depends on the final borrow - confirm this is acceptable
     891  1.1  christos 	mov	w8,w4			// C=0: take the unreduced words instead
     892  1.1  christos 	mov	w9,w5
     893  1.1  christos 	mov	w10,w6
     894  1.1  christos 	mov	w11,w7
     895  1.1  christos .LCopy_1:
     896  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
     897  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
     898  1.1  christos 	ldp	w4,w5,[x1],#8		// next four unreduced words
     899  1.1  christos 	ldp	w6,w7,[x1],#8
     900  1.1  christos 	stp	w8,w9,[x0],#8		// write back the first four selected words
     901  1.1  christos 	stp	w10,w11,[x0],#8
     902  1.1  christos 	sub	x1,x1,#32		// back to the start of this 32-byte chunk for the wipe below
     903  1.1  christos 	ldp	w8,w9,[x0],#8
     904  1.1  christos 	ldp	w10,w11,[x0]
     905  1.1  christos 	sub	x0,x0,#8
     906  1.1  christos 	b.cs	.LCopy_2		// same selection for the second four words
     907  1.1  christos 	mov	w8, w4
     908  1.1  christos 	mov	w9, w5
     909  1.1  christos 	mov	w10, w6
     910  1.1  christos 	mov	w11, w7
     911  1.1  christos .LCopy_2:
     912  1.1  christos 	st1	{v0.2d,v1.2d}, [x1],#32		// wipe the eight unreduced words just consumed
     913  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
     914  1.1  christos 	sub	x17,x2,x1		// preserves carry
     915  1.1  christos 	stp	w8,w9,[x0],#8
     916  1.1  christos 	stp	w10,w11,[x0],#8
     917  1.1  christos 	cbnz	x17,.LNEON_copy_n_zap	// until x1 reaches x2
    918  1.1  christos 
     919  1.1  christos 	mov	sp,x16			// restore sp from x16 (presumably saved by the prologue, which is outside this view)
     920  1.1  christos 	ldp	d14,d15,[sp,#64]	// reload d8-d15 from the save area
     921  1.1  christos 	ldp	d12,d13,[sp,#48]
     922  1.1  christos 	ldp	d10,d11,[sp,#32]
     923  1.1  christos 	ldp	d8,d9,[sp,#16]
     924  1.1  christos 	ldr	x29,[sp],#80		// restore x29 and release the 80-byte save area
     925  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
     926  1.1  christos 	ret	// return via x30, which this epilogue does not reload
    927  1.1  christos 
    928  1.1  christos .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
     929  1.1  christos .type	__bn_sqr8x_mont,%function
     930  1.1  christos .align	5
     931  1.1  christos __bn_sqr8x_mont:	// branched to from bn_mul_mont when num (x5) is a multiple of 8
     932  1.1  christos 	cmp	x1,x2			// same pointer for both operands (ap and bp in bn_mul_mont)?
     933  1.1  christos 	b.ne	__bn_mul4x_mont		// no: not a squaring, use the 4x multiply instead
     934  1.1  christos .Lsqr8x_mont:
     935  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
     936  1.2  christos 	// only from bn_mul_mont which has already signed the return address.
     937  1.1  christos 	stp	x29,x30,[sp,#-128]!	// 128-byte frame: x29/x30, x19-x28, rp, np, and n0 at +112
     938  1.1  christos 	add	x29,sp,#0
     939  1.1  christos 	stp	x19,x20,[sp,#16]
     940  1.1  christos 	stp	x21,x22,[sp,#32]
     941  1.1  christos 	stp	x23,x24,[sp,#48]
     942  1.1  christos 	stp	x25,x26,[sp,#64]
     943  1.1  christos 	stp	x27,x28,[sp,#80]
     944  1.1  christos 	stp	x0,x3,[sp,#96]	// offload rp and np
     945  1.1  christos 
     946  1.1  christos 	ldp	x6,x7,[x1,#8*0]		// x6..x13 = a[0..7]
     947  1.1  christos 	ldp	x8,x9,[x1,#8*2]
     948  1.1  christos 	ldp	x10,x11,[x1,#8*4]
     949  1.1  christos 	ldp	x12,x13,[x1,#8*6]
     950  1.1  christos 
     951  1.1  christos 	sub	x2,sp,x5,lsl#4		// reserve 16*num bytes, i.e. 2*num 64-bit words
     952  1.1  christos 	lsl	x5,x5,#3		// x5 = num*8, operand length in bytes
     953  1.1  christos 	ldr	x4,[x4]		// *n0
     954  1.1  christos 	mov	sp,x2			// alloca
     955  1.1  christos 	sub	x27,x5,#8*8		// zeroing counter: num*8 - 64
     956  1.1  christos 	b	.Lsqr8x_zero_start	// enter mid-loop: the lowest 64 bytes are not zeroed, the first outer-loop pass stores t[0..7] there
     957  1.1  christos 
     958  1.1  christos .Lsqr8x_zero:	// zero 128 bytes per full pass while x27 counts down by 64
     959  1.1  christos 	sub	x27,x27,#8*8
     960  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
     961  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
     962  1.1  christos 	stp	xzr,xzr,[x2,#8*4]
     963  1.1  christos 	stp	xzr,xzr,[x2,#8*6]
     964  1.1  christos .Lsqr8x_zero_start:
     965  1.1  christos 	stp	xzr,xzr,[x2,#8*8]
     966  1.1  christos 	stp	xzr,xzr,[x2,#8*10]
     967  1.1  christos 	stp	xzr,xzr,[x2,#8*12]
     968  1.1  christos 	stp	xzr,xzr,[x2,#8*14]
     969  1.1  christos 	add	x2,x2,#8*16
     970  1.1  christos 	cbnz	x27,.Lsqr8x_zero
     971  1.1  christos 
     972  1.1  christos 	add	x3,x1,x5		// x3 = &a[num], end of the operand
     973  1.1  christos 	add	x1,x1,#8*8		// x1 = &a[8], the next eight words
     974  1.1  christos 	mov	x19,xzr			// x19..x26 = running t[] window, starts at zero
     975  1.1  christos 	mov	x20,xzr
     976  1.1  christos 	mov	x21,xzr
     977  1.1  christos 	mov	x22,xzr
     978  1.1  christos 	mov	x23,xzr
     979  1.1  christos 	mov	x24,xzr
     980  1.1  christos 	mov	x25,xzr
     981  1.1  christos 	mov	x26,xzr
     982  1.1  christos 	mov	x2,sp			// x2 = &t[0]
     983  1.1  christos 	str	x4,[x29,#112]		// offload n0
    984  1.1  christos 
    985  1.1  christos 	// Multiply everything but a[i]*a[i]
    986  1.1  christos .align	4
    987  1.1  christos .Lsqr8x_outer_loop:
    988  1.1  christos         //                                                 a[1]a[0]	(i)
    989  1.1  christos         //                                             a[2]a[0]
    990  1.1  christos         //                                         a[3]a[0]
    991  1.1  christos         //                                     a[4]a[0]
    992  1.1  christos         //                                 a[5]a[0]
    993  1.1  christos         //                             a[6]a[0]
    994  1.1  christos         //                         a[7]a[0]
    995  1.1  christos         //                                         a[2]a[1]		(ii)
    996  1.1  christos         //                                     a[3]a[1]
    997  1.1  christos         //                                 a[4]a[1]
    998  1.1  christos         //                             a[5]a[1]
    999  1.1  christos         //                         a[6]a[1]
   1000  1.1  christos         //                     a[7]a[1]
   1001  1.1  christos         //                                 a[3]a[2]			(iii)
   1002  1.1  christos         //                             a[4]a[2]
   1003  1.1  christos         //                         a[5]a[2]
   1004  1.1  christos         //                     a[6]a[2]
   1005  1.1  christos         //                 a[7]a[2]
   1006  1.1  christos         //                         a[4]a[3]				(iv)
   1007  1.1  christos         //                     a[5]a[3]
   1008  1.1  christos         //                 a[6]a[3]
   1009  1.1  christos         //             a[7]a[3]
   1010  1.1  christos         //                 a[5]a[4]					(v)
   1011  1.1  christos         //             a[6]a[4]
   1012  1.1  christos         //         a[7]a[4]
   1013  1.1  christos         //         a[6]a[5]						(vi)
   1014  1.1  christos         //     a[7]a[5]
   1015  1.1  christos         // a[7]a[6]							(vii)
   1016  1.1  christos 
    1017  1.1  christos 	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
    1018  1.1  christos 	mul	x15,x8,x6
    1019  1.1  christos 	mul	x16,x9,x6
    1020  1.1  christos 	mul	x17,x10,x6
    1021  1.1  christos 	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
    1022  1.1  christos 	mul	x14,x11,x6
    1023  1.1  christos 	adcs	x21,x21,x15		// t[2]+lo(a[2]*a[0])
    1024  1.1  christos 	mul	x15,x12,x6
    1025  1.1  christos 	adcs	x22,x22,x16		// t[3]+lo(a[3]*a[0])
    1026  1.1  christos 	mul	x16,x13,x6
    1027  1.1  christos 	adcs	x23,x23,x17		// t[4]+lo(a[4]*a[0])
    1028  1.1  christos 	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
    1029  1.1  christos 	adcs	x24,x24,x14		// t[5]+lo(a[5]*a[0])
    1030  1.1  christos 	umulh	x14,x8,x6
    1031  1.1  christos 	adcs	x25,x25,x15		// t[6]+lo(a[6]*a[0])
    1032  1.1  christos 	umulh	x15,x9,x6
    1033  1.1  christos 	adcs	x26,x26,x16		// t[7]+lo(a[7]*a[0])
    1034  1.1  christos 	umulh	x16,x10,x6
    1035  1.1  christos 	stp	x19,x20,[x2],#8*2	// t[0..1]
    1036  1.1  christos 	adc	x19,xzr,xzr		// t[8]
    1037  1.1  christos 	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
    1038  1.1  christos 	umulh	x17,x11,x6
    1039  1.1  christos 	adcs	x22,x22,x14		// t[3]+hi(a[2]*a[0])
    1040  1.1  christos 	umulh	x14,x12,x6
    1041  1.1  christos 	adcs	x23,x23,x15		// t[4]+hi(a[3]*a[0])
    1042  1.1  christos 	umulh	x15,x13,x6
    1043  1.1  christos 	adcs	x24,x24,x16		// t[5]+hi(a[4]*a[0])
    1044  1.1  christos 	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
    1045  1.1  christos 	adcs	x25,x25,x17		// t[6]+hi(a[5]*a[0])
    1046  1.1  christos 	mul	x17,x9,x7
    1047  1.1  christos 	adcs	x26,x26,x14		// t[7]+hi(a[6]*a[0])
    1048  1.1  christos 	mul	x14,x10,x7
    1049  1.1  christos 	adc	x19,x19,x15		// t[8]+hi(a[7]*a[0])
   1050  1.1  christos 
   1051  1.1  christos 	mul	x15,x11,x7
   1052  1.1  christos 	adds	x22,x22,x16
   1053  1.1  christos 	mul	x16,x12,x7
   1054  1.1  christos 	adcs	x23,x23,x17
   1055  1.1  christos 	mul	x17,x13,x7
   1056  1.1  christos 	adcs	x24,x24,x14
   1057  1.1  christos 	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
   1058  1.1  christos 	adcs	x25,x25,x15
   1059  1.1  christos 	umulh	x15,x9,x7
   1060  1.1  christos 	adcs	x26,x26,x16
   1061  1.1  christos 	umulh	x16,x10,x7
   1062  1.1  christos 	adcs	x19,x19,x17
   1063  1.1  christos 	umulh	x17,x11,x7
   1064  1.1  christos 	stp	x21,x22,[x2],#8*2	// t[2..3]
   1065  1.1  christos 	adc	x20,xzr,xzr		// t[9]
   1066  1.1  christos 	adds	x23,x23,x14
   1067  1.1  christos 	umulh	x14,x12,x7
   1068  1.1  christos 	adcs	x24,x24,x15
   1069  1.1  christos 	umulh	x15,x13,x7
   1070  1.1  christos 	adcs	x25,x25,x16
   1071  1.1  christos 	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
   1072  1.1  christos 	adcs	x26,x26,x17
   1073  1.1  christos 	mul	x17,x10,x8
   1074  1.1  christos 	adcs	x19,x19,x14
   1075  1.1  christos 	mul	x14,x11,x8
   1076  1.1  christos 	adc	x20,x20,x15
   1077  1.1  christos 
   1078  1.1  christos 	mul	x15,x12,x8
   1079  1.1  christos 	adds	x24,x24,x16
   1080  1.1  christos 	mul	x16,x13,x8
   1081  1.1  christos 	adcs	x25,x25,x17
   1082  1.1  christos 	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
   1083  1.1  christos 	adcs	x26,x26,x14
   1084  1.1  christos 	umulh	x14,x10,x8
   1085  1.1  christos 	adcs	x19,x19,x15
   1086  1.1  christos 	umulh	x15,x11,x8
   1087  1.1  christos 	adcs	x20,x20,x16
   1088  1.1  christos 	umulh	x16,x12,x8
   1089  1.1  christos 	stp	x23,x24,[x2],#8*2	// t[4..5]
   1090  1.1  christos 	adc	x21,xzr,xzr		// t[10]
   1091  1.1  christos 	adds	x25,x25,x17
   1092  1.1  christos 	umulh	x17,x13,x8
   1093  1.1  christos 	adcs	x26,x26,x14
   1094  1.1  christos 	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
   1095  1.1  christos 	adcs	x19,x19,x15
   1096  1.1  christos 	mul	x15,x11,x9
   1097  1.1  christos 	adcs	x20,x20,x16
   1098  1.1  christos 	mul	x16,x12,x9
   1099  1.1  christos 	adc	x21,x21,x17
   1100  1.1  christos 
   1101  1.1  christos 	mul	x17,x13,x9
   1102  1.1  christos 	adds	x26,x26,x14
   1103  1.1  christos 	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
   1104  1.1  christos 	adcs	x19,x19,x15
   1105  1.1  christos 	umulh	x15,x11,x9
   1106  1.1  christos 	adcs	x20,x20,x16
   1107  1.1  christos 	umulh	x16,x12,x9
   1108  1.1  christos 	adcs	x21,x21,x17
   1109  1.1  christos 	umulh	x17,x13,x9
   1110  1.1  christos 	stp	x25,x26,[x2],#8*2	// t[6..7]
   1111  1.1  christos 	adc	x22,xzr,xzr		// t[11]
   1112  1.1  christos 	adds	x19,x19,x14
   1113  1.1  christos 	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
   1114  1.1  christos 	adcs	x20,x20,x15
   1115  1.1  christos 	mul	x15,x12,x10
   1116  1.1  christos 	adcs	x21,x21,x16
   1117  1.1  christos 	mul	x16,x13,x10
   1118  1.1  christos 	adc	x22,x22,x17
   1119  1.1  christos 
   1120  1.1  christos 	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
   1121  1.1  christos 	adds	x20,x20,x14
   1122  1.1  christos 	umulh	x14,x12,x10
   1123  1.1  christos 	adcs	x21,x21,x15
   1124  1.1  christos 	umulh	x15,x13,x10
   1125  1.1  christos 	adcs	x22,x22,x16
   1126  1.1  christos 	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
   1127  1.1  christos 	adc	x23,xzr,xzr		// t[12]
   1128  1.1  christos 	adds	x21,x21,x17
   1129  1.1  christos 	mul	x17,x13,x11
   1130  1.1  christos 	adcs	x22,x22,x14
   1131  1.1  christos 	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
   1132  1.1  christos 	adc	x23,x23,x15
   1133  1.1  christos 
   1134  1.1  christos 	umulh	x15,x13,x11
   1135  1.1  christos 	adds	x22,x22,x16
   1136  1.1  christos 	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
   1137  1.1  christos 	adcs	x23,x23,x17
   1138  1.1  christos 	umulh	x17,x13,x12		// hi(a[7]*a[6])
   1139  1.1  christos 	adc	x24,xzr,xzr		// t[13]
   1140  1.1  christos 	adds	x23,x23,x14
   1141  1.1  christos 	sub	x27,x3,x1	// done yet?
   1142  1.1  christos 	adc	x24,x24,x15
   1143  1.1  christos 
   1144  1.1  christos 	adds	x24,x24,x16
   1145  1.1  christos 	sub	x14,x3,x5	// rewinded ap
   1146  1.1  christos 	adc	x25,xzr,xzr		// t[14]
   1147  1.1  christos 	add	x25,x25,x17
   1148  1.1  christos 
   1149  1.1  christos 	cbz	x27,.Lsqr8x_outer_break
   1150  1.1  christos 
   1151  1.1  christos 	mov	x4,x6
   1152  1.1  christos 	ldp	x6,x7,[x2,#8*0]
   1153  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1154  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1155  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1156  1.1  christos 	adds	x19,x19,x6
   1157  1.1  christos 	adcs	x20,x20,x7
   1158  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1159  1.1  christos 	adcs	x21,x21,x8
   1160  1.1  christos 	adcs	x22,x22,x9
   1161  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1162  1.1  christos 	adcs	x23,x23,x10
   1163  1.1  christos 	adcs	x24,x24,x11
   1164  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1165  1.1  christos 	adcs	x25,x25,x12
   1166  1.1  christos 	mov	x0,x1
   1167  1.1  christos 	adcs	x26,xzr,x13
   1168  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1169  1.1  christos 	add	x1,x1,#8*8
   1170  1.1  christos 	//adc	x28,xzr,xzr		// moved below
   1171  1.1  christos 	mov	x27,#-8*8
   1172  1.1  christos 
   1173  1.1  christos 	//                                                         a[8]a[0]
   1174  1.1  christos 	//                                                     a[9]a[0]
   1175  1.1  christos 	//                                                 a[a]a[0]
   1176  1.1  christos 	//                                             a[b]a[0]
   1177  1.1  christos 	//                                         a[c]a[0]
   1178  1.1  christos 	//                                     a[d]a[0]
   1179  1.1  christos 	//                                 a[e]a[0]
   1180  1.1  christos 	//                             a[f]a[0]
   1181  1.1  christos 	//                                                     a[8]a[1]
   1182  1.1  christos 	//                         a[f]a[1]........................
   1183  1.1  christos 	//                                                 a[8]a[2]
   1184  1.1  christos 	//                     a[f]a[2]........................
   1185  1.1  christos 	//                                             a[8]a[3]
   1186  1.1  christos 	//                 a[f]a[3]........................
   1187  1.1  christos 	//                                         a[8]a[4]
   1188  1.1  christos 	//             a[f]a[4]........................
   1189  1.1  christos 	//                                     a[8]a[5]
   1190  1.1  christos 	//         a[f]a[5]........................
   1191  1.1  christos 	//                                 a[8]a[6]
   1192  1.1  christos 	//     a[f]a[6]........................
   1193  1.1  christos 	//                             a[8]a[7]
   1194  1.1  christos 	// a[f]a[7]........................
   1195  1.1  christos .Lsqr8x_mul:
   1196  1.1  christos 	mul	x14,x6,x4
   1197  1.1  christos 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
   1198  1.1  christos 	mul	x15,x7,x4
   1199  1.1  christos 	add	x27,x27,#8
   1200  1.1  christos 	mul	x16,x8,x4
   1201  1.1  christos 	mul	x17,x9,x4
   1202  1.1  christos 	adds	x19,x19,x14
   1203  1.1  christos 	mul	x14,x10,x4
   1204  1.1  christos 	adcs	x20,x20,x15
   1205  1.1  christos 	mul	x15,x11,x4
   1206  1.1  christos 	adcs	x21,x21,x16
   1207  1.1  christos 	mul	x16,x12,x4
   1208  1.1  christos 	adcs	x22,x22,x17
   1209  1.1  christos 	mul	x17,x13,x4
   1210  1.1  christos 	adcs	x23,x23,x14
   1211  1.1  christos 	umulh	x14,x6,x4
   1212  1.1  christos 	adcs	x24,x24,x15
   1213  1.1  christos 	umulh	x15,x7,x4
   1214  1.1  christos 	adcs	x25,x25,x16
   1215  1.1  christos 	umulh	x16,x8,x4
   1216  1.1  christos 	adcs	x26,x26,x17
   1217  1.1  christos 	umulh	x17,x9,x4
   1218  1.1  christos 	adc	x28,x28,xzr
   1219  1.1  christos 	str	x19,[x2],#8
   1220  1.1  christos 	adds	x19,x20,x14
   1221  1.1  christos 	umulh	x14,x10,x4
   1222  1.1  christos 	adcs	x20,x21,x15
   1223  1.1  christos 	umulh	x15,x11,x4
   1224  1.1  christos 	adcs	x21,x22,x16
   1225  1.1  christos 	umulh	x16,x12,x4
   1226  1.1  christos 	adcs	x22,x23,x17
   1227  1.1  christos 	umulh	x17,x13,x4
   1228  1.1  christos 	ldr	x4,[x0,x27]
   1229  1.1  christos 	adcs	x23,x24,x14
   1230  1.1  christos 	adcs	x24,x25,x15
   1231  1.1  christos 	adcs	x25,x26,x16
   1232  1.1  christos 	adcs	x26,x28,x17
   1233  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1234  1.1  christos 	cbnz	x27,.Lsqr8x_mul
   1235  1.1  christos 					// note that carry flag is guaranteed
   1236  1.1  christos 					// to be zero at this point
   1237  1.1  christos 	cmp	x1,x3		// done yet?
   1238  1.1  christos 	b.eq	.Lsqr8x_break
   1239  1.1  christos 
   1240  1.1  christos 	ldp	x6,x7,[x2,#8*0]
   1241  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1242  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1243  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1244  1.1  christos 	adds	x19,x19,x6
   1245  1.1  christos 	ldur	x4,[x0,#-8*8]
   1246  1.1  christos 	adcs	x20,x20,x7
   1247  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1248  1.1  christos 	adcs	x21,x21,x8
   1249  1.1  christos 	adcs	x22,x22,x9
   1250  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1251  1.1  christos 	adcs	x23,x23,x10
   1252  1.1  christos 	adcs	x24,x24,x11
   1253  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1254  1.1  christos 	adcs	x25,x25,x12
   1255  1.1  christos 	mov	x27,#-8*8
   1256  1.1  christos 	adcs	x26,x26,x13
   1257  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1258  1.1  christos 	add	x1,x1,#8*8
   1259  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1260  1.1  christos 	b	.Lsqr8x_mul
   1261  1.1  christos 
   1262  1.1  christos .align	4
    1263  1.1  christos .Lsqr8x_break:	// reached when x1 == x3 after .Lsqr8x_mul: set up the next .Lsqr8x_outer_loop pass
    1264  1.1  christos 	ldp	x6,x7,[x0,#8*0]		// x6..x13 = the eight operand words at x0
    1265  1.1  christos 	add	x1,x0,#8*8		// x1 = first word after them
    1266  1.1  christos 	ldp	x8,x9,[x0,#8*2]
    1267  1.1  christos 	sub	x14,x3,x1		// is it last iteration? x14 = bytes left up to x3
    1268  1.1  christos 	ldp	x10,x11,[x0,#8*4]
    1269  1.1  christos 	sub	x15,x2,x14		// x15 = x2 moved back by that many bytes
    1270  1.1  christos 	ldp	x12,x13,[x0,#8*6]
    1271  1.1  christos 	cbz	x14,.Lsqr8x_outer_loop	// nothing left: continue with x19..x26 and x2 unchanged
    1272  1.1  christos 
    1273  1.1  christos 	stp	x19,x20,[x2,#8*0]	// otherwise store the eight accumulators at x2 ...
    1274  1.1  christos 	ldp	x19,x20,[x15,#8*0]	// ... and reload x19..x26 from the eight words at x15
    1275  1.1  christos 	stp	x21,x22,[x2,#8*2]
    1276  1.1  christos 	ldp	x21,x22,[x15,#8*2]
    1277  1.1  christos 	stp	x23,x24,[x2,#8*4]
    1278  1.1  christos 	ldp	x23,x24,[x15,#8*4]
    1279  1.1  christos 	stp	x25,x26,[x2,#8*6]
    1280  1.1  christos 	mov	x2,x15			// x2 now points at the reloaded words
    1281  1.1  christos 	ldp	x25,x26,[x15,#8*6]
    1282  1.1  christos 	b	.Lsqr8x_outer_loop
   1283  1.1  christos 
   1284  1.1  christos .align	4
   1285  1.1  christos .Lsqr8x_outer_break:
   1286  1.1  christos 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
   1287  1.1  christos 	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
   1288  1.1  christos 	ldp	x15,x16,[sp,#8*1]
   1289  1.1  christos 	ldp	x11,x13,[x14,#8*2]
   1290  1.1  christos 	add	x1,x14,#8*4
   1291  1.1  christos 	ldp	x17,x14,[sp,#8*3]
   1292  1.1  christos 
   1293  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1294  1.1  christos 	mul	x19,x7,x7
   1295  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1296  1.1  christos 	umulh	x7,x7,x7
   1297  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1298  1.1  christos 	mul	x8,x9,x9
   1299  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1300  1.1  christos 	mov	x2,sp
   1301  1.1  christos 	umulh	x9,x9,x9
   1302  1.1  christos 	adds	x20,x7,x15,lsl#1
   1303  1.1  christos 	extr	x15,x16,x15,#63
   1304  1.1  christos 	sub	x27,x5,#8*4
   1305  1.1  christos 
   1306  1.1  christos .Lsqr4x_shift_n_add:
   1307  1.1  christos 	adcs	x21,x8,x15
   1308  1.1  christos 	extr	x16,x17,x16,#63
   1309  1.1  christos 	sub	x27,x27,#8*4
   1310  1.1  christos 	adcs	x22,x9,x16
   1311  1.1  christos 	ldp	x15,x16,[x2,#8*5]
   1312  1.1  christos 	mul	x10,x11,x11
   1313  1.1  christos 	ldp	x7,x9,[x1],#8*2
   1314  1.1  christos 	umulh	x11,x11,x11
   1315  1.1  christos 	mul	x12,x13,x13
   1316  1.1  christos 	umulh	x13,x13,x13
   1317  1.1  christos 	extr	x17,x14,x17,#63
   1318  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1319  1.1  christos 	adcs	x23,x10,x17
   1320  1.1  christos 	extr	x14,x15,x14,#63
   1321  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1322  1.1  christos 	adcs	x24,x11,x14
   1323  1.1  christos 	ldp	x17,x14,[x2,#8*7]
   1324  1.1  christos 	extr	x15,x16,x15,#63
   1325  1.1  christos 	adcs	x25,x12,x15
   1326  1.1  christos 	extr	x16,x17,x16,#63
   1327  1.1  christos 	adcs	x26,x13,x16
   1328  1.1  christos 	ldp	x15,x16,[x2,#8*9]
   1329  1.1  christos 	mul	x6,x7,x7
   1330  1.1  christos 	ldp	x11,x13,[x1],#8*2
   1331  1.1  christos 	umulh	x7,x7,x7
   1332  1.1  christos 	mul	x8,x9,x9
   1333  1.1  christos 	umulh	x9,x9,x9
   1334  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1335  1.1  christos 	extr	x17,x14,x17,#63
   1336  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1337  1.1  christos 	add	x2,x2,#8*8
   1338  1.1  christos 	adcs	x19,x6,x17
   1339  1.1  christos 	extr	x14,x15,x14,#63
   1340  1.1  christos 	adcs	x20,x7,x14
   1341  1.1  christos 	ldp	x17,x14,[x2,#8*3]
   1342  1.1  christos 	extr	x15,x16,x15,#63
   1343  1.1  christos 	cbnz	x27,.Lsqr4x_shift_n_add
   1344  1.1  christos 	ldp	x1,x4,[x29,#104]	// pull np and n0
   1345  1.1  christos 
   1346  1.1  christos 	adcs	x21,x8,x15
   1347  1.1  christos 	extr	x16,x17,x16,#63
   1348  1.1  christos 	adcs	x22,x9,x16
   1349  1.1  christos 	ldp	x15,x16,[x2,#8*5]
   1350  1.1  christos 	mul	x10,x11,x11
   1351  1.1  christos 	umulh	x11,x11,x11
   1352  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1353  1.1  christos 	mul	x12,x13,x13
   1354  1.1  christos 	umulh	x13,x13,x13
   1355  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1356  1.1  christos 	extr	x17,x14,x17,#63
   1357  1.1  christos 	adcs	x23,x10,x17
   1358  1.1  christos 	extr	x14,x15,x14,#63
   1359  1.1  christos 	ldp	x19,x20,[sp,#8*0]
   1360  1.1  christos 	adcs	x24,x11,x14
   1361  1.1  christos 	extr	x15,x16,x15,#63
   1362  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1363  1.1  christos 	adcs	x25,x12,x15
   1364  1.1  christos 	extr	x16,xzr,x16,#63
   1365  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1366  1.1  christos 	adc	x26,x13,x16
   1367  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1368  1.1  christos 
   1369  1.1  christos 	// Reduce by 512 bits per iteration
   1370  1.1  christos 	mul	x28,x4,x19		// t[0]*n0
   1371  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1372  1.1  christos 	add	x3,x1,x5
   1373  1.1  christos 	ldp	x21,x22,[sp,#8*2]
   1374  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1375  1.1  christos 	ldp	x23,x24,[sp,#8*4]
   1376  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1377  1.1  christos 	ldp	x25,x26,[sp,#8*6]
   1378  1.1  christos 	add	x1,x1,#8*8
   1379  1.1  christos 	mov	x30,xzr		// initial top-most carry
   1380  1.1  christos 	mov	x2,sp
   1381  1.1  christos 	mov	x27,#8
   1382  1.1  christos 
   1383  1.1  christos .Lsqr8x_reduction:
   1384  1.1  christos 	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
   1385  1.1  christos 	mul	x15,x7,x28
   1386  1.1  christos 	sub	x27,x27,#1
   1387  1.1  christos 	mul	x16,x8,x28
   1388  1.1  christos 	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
   1389  1.1  christos 	mul	x17,x9,x28
   1390  1.1  christos 	// (*)	adds	xzr,x19,x14
   1391  1.1  christos 	subs	xzr,x19,#1		// (*)
   1392  1.1  christos 	mul	x14,x10,x28
   1393  1.1  christos 	adcs	x19,x20,x15
   1394  1.1  christos 	mul	x15,x11,x28
   1395  1.1  christos 	adcs	x20,x21,x16
   1396  1.1  christos 	mul	x16,x12,x28
   1397  1.1  christos 	adcs	x21,x22,x17
   1398  1.1  christos 	mul	x17,x13,x28
   1399  1.1  christos 	adcs	x22,x23,x14
   1400  1.1  christos 	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
   1401  1.1  christos 	adcs	x23,x24,x15
   1402  1.1  christos 	umulh	x15,x7,x28
   1403  1.1  christos 	adcs	x24,x25,x16
   1404  1.1  christos 	umulh	x16,x8,x28
   1405  1.1  christos 	adcs	x25,x26,x17
   1406  1.1  christos 	umulh	x17,x9,x28
   1407  1.1  christos 	adc	x26,xzr,xzr
   1408  1.1  christos 	adds	x19,x19,x14
   1409  1.1  christos 	umulh	x14,x10,x28
   1410  1.1  christos 	adcs	x20,x20,x15
   1411  1.1  christos 	umulh	x15,x11,x28
   1412  1.1  christos 	adcs	x21,x21,x16
   1413  1.1  christos 	umulh	x16,x12,x28
   1414  1.1  christos 	adcs	x22,x22,x17
   1415  1.1  christos 	umulh	x17,x13,x28
   1416  1.1  christos 	mul	x28,x4,x19		// next t[0]*n0
   1417  1.1  christos 	adcs	x23,x23,x14
   1418  1.1  christos 	adcs	x24,x24,x15
   1419  1.1  christos 	adcs	x25,x25,x16
   1420  1.1  christos 	adc	x26,x26,x17
   1421  1.1  christos 	cbnz	x27,.Lsqr8x_reduction
   1422  1.1  christos 
   1423  1.1  christos 	ldp	x14,x15,[x2,#8*0]
   1424  1.1  christos 	ldp	x16,x17,[x2,#8*2]
   1425  1.1  christos 	mov	x0,x2
   1426  1.1  christos 	sub	x27,x3,x1	// done yet?
   1427  1.1  christos 	adds	x19,x19,x14
   1428  1.1  christos 	adcs	x20,x20,x15
   1429  1.1  christos 	ldp	x14,x15,[x2,#8*4]
   1430  1.1  christos 	adcs	x21,x21,x16
   1431  1.1  christos 	adcs	x22,x22,x17
   1432  1.1  christos 	ldp	x16,x17,[x2,#8*6]
   1433  1.1  christos 	adcs	x23,x23,x14
   1434  1.1  christos 	adcs	x24,x24,x15
   1435  1.1  christos 	adcs	x25,x25,x16
   1436  1.1  christos 	adcs	x26,x26,x17
   1437  1.1  christos 	//adc	x28,xzr,xzr		// moved below
   1438  1.1  christos 	cbz	x27,.Lsqr8x8_post_condition
   1439  1.1  christos 
   1440  1.1  christos 	ldur	x4,[x2,#-8*8]
   1441  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1442  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1443  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1444  1.1  christos 	mov	x27,#-8*8
   1445  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1446  1.1  christos 	add	x1,x1,#8*8
   1447  1.1  christos 
   1448  1.1  christos .Lsqr8x_tail:
   1449  1.1  christos 	mul	x14,x6,x4
   1450  1.1  christos 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
   1451  1.1  christos 	mul	x15,x7,x4
   1452  1.1  christos 	add	x27,x27,#8
   1453  1.1  christos 	mul	x16,x8,x4
   1454  1.1  christos 	mul	x17,x9,x4
   1455  1.1  christos 	adds	x19,x19,x14
   1456  1.1  christos 	mul	x14,x10,x4
   1457  1.1  christos 	adcs	x20,x20,x15
   1458  1.1  christos 	mul	x15,x11,x4
   1459  1.1  christos 	adcs	x21,x21,x16
   1460  1.1  christos 	mul	x16,x12,x4
   1461  1.1  christos 	adcs	x22,x22,x17
   1462  1.1  christos 	mul	x17,x13,x4
   1463  1.1  christos 	adcs	x23,x23,x14
   1464  1.1  christos 	umulh	x14,x6,x4
   1465  1.1  christos 	adcs	x24,x24,x15
   1466  1.1  christos 	umulh	x15,x7,x4
   1467  1.1  christos 	adcs	x25,x25,x16
   1468  1.1  christos 	umulh	x16,x8,x4
   1469  1.1  christos 	adcs	x26,x26,x17
   1470  1.1  christos 	umulh	x17,x9,x4
   1471  1.1  christos 	adc	x28,x28,xzr
   1472  1.1  christos 	str	x19,[x2],#8
   1473  1.1  christos 	adds	x19,x20,x14
   1474  1.1  christos 	umulh	x14,x10,x4
   1475  1.1  christos 	adcs	x20,x21,x15
   1476  1.1  christos 	umulh	x15,x11,x4
   1477  1.1  christos 	adcs	x21,x22,x16
   1478  1.1  christos 	umulh	x16,x12,x4
   1479  1.1  christos 	adcs	x22,x23,x17
   1480  1.1  christos 	umulh	x17,x13,x4
   1481  1.1  christos 	ldr	x4,[x0,x27]
   1482  1.1  christos 	adcs	x23,x24,x14
   1483  1.1  christos 	adcs	x24,x25,x15
   1484  1.1  christos 	adcs	x25,x26,x16
   1485  1.1  christos 	adcs	x26,x28,x17
   1486  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1487  1.1  christos 	cbnz	x27,.Lsqr8x_tail
   1488  1.1  christos 					// note that carry flag is guaranteed
   1489  1.1  christos 					// to be zero at this point
   1490  1.1  christos 	ldp	x6,x7,[x2,#8*0]
   1491  1.1  christos 	sub	x27,x3,x1	// done yet?
   1492  1.1  christos 	sub	x16,x3,x5	// rewinded np
   1493  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1494  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1495  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1496  1.1  christos 	cbz	x27,.Lsqr8x_tail_break
   1497  1.1  christos 
   1498  1.1  christos 	ldur	x4,[x0,#-8*8]
   1499  1.1  christos 	adds	x19,x19,x6
   1500  1.1  christos 	adcs	x20,x20,x7
   1501  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1502  1.1  christos 	adcs	x21,x21,x8
   1503  1.1  christos 	adcs	x22,x22,x9
   1504  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1505  1.1  christos 	adcs	x23,x23,x10
   1506  1.1  christos 	adcs	x24,x24,x11
   1507  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1508  1.1  christos 	adcs	x25,x25,x12
   1509  1.1  christos 	mov	x27,#-8*8
   1510  1.1  christos 	adcs	x26,x26,x13
   1511  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1512  1.1  christos 	add	x1,x1,#8*8
   1513  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1514  1.1  christos 	b	.Lsqr8x_tail
   1515  1.1  christos 
   1516  1.1  christos .align	4
   1517  1.1  christos .Lsqr8x_tail_break:
   1518  1.1  christos 	ldr	x4,[x29,#112]		// pull n0
   1519  1.1  christos 	add	x27,x2,#8*8		// end of current t[num] window
   1520  1.1  christos 
   1521  1.1  christos 	subs	xzr,x30,#1		// "move" top-most carry to carry bit
   1522  1.1  christos 	adcs	x14,x19,x6
   1523  1.1  christos 	adcs	x15,x20,x7
   1524  1.1  christos 	ldp	x19,x20,[x0,#8*0]
   1525  1.1  christos 	adcs	x21,x21,x8
   1526  1.1  christos 	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
   1527  1.1  christos 	adcs	x22,x22,x9
   1528  1.1  christos 	ldp	x8,x9,[x16,#8*2]
   1529  1.1  christos 	adcs	x23,x23,x10
   1530  1.1  christos 	adcs	x24,x24,x11
   1531  1.1  christos 	ldp	x10,x11,[x16,#8*4]
   1532  1.1  christos 	adcs	x25,x25,x12
   1533  1.1  christos 	adcs	x26,x26,x13
   1534  1.1  christos 	ldp	x12,x13,[x16,#8*6]
   1535  1.1  christos 	add	x1,x16,#8*8
   1536  1.1  christos 	adc	x30,xzr,xzr	// top-most carry
   1537  1.1  christos 	mul	x28,x4,x19
   1538  1.1  christos 	stp	x14,x15,[x2,#8*0]
   1539  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1540  1.1  christos 	ldp	x21,x22,[x0,#8*2]
   1541  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1542  1.1  christos 	ldp	x23,x24,[x0,#8*4]
   1543  1.1  christos 	cmp	x27,x29		// did we hit the bottom?
   1544  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1545  1.1  christos 	mov	x2,x0			// slide the window
   1546  1.1  christos 	ldp	x25,x26,[x0,#8*6]
   1547  1.1  christos 	mov	x27,#8
   1548  1.1  christos 	b.ne	.Lsqr8x_reduction
   1549  1.1  christos 
   1550  1.1  christos 	// Final step. We see if result is larger than modulus, and
   1551  1.1  christos 	// if it is, subtract the modulus. But comparison implies
   1552  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
   1553  1.1  christos 	// and conditionally copy original value.
   1554  1.1  christos 	ldr	x0,[x29,#96]		// pull rp
   1555  1.1  christos 	add	x2,x2,#8*8
   1556  1.1  christos 	subs	x14,x19,x6
   1557  1.1  christos 	sbcs	x15,x20,x7
   1558  1.1  christos 	sub	x27,x5,#8*8
   1559  1.1  christos 	mov	x3,x0		// x0 copy
   1560  1.1  christos 
   1561  1.1  christos .Lsqr8x_sub:
   1562  1.1  christos 	sbcs	x16,x21,x8
   1563  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1564  1.1  christos 	sbcs	x17,x22,x9
   1565  1.1  christos 	stp	x14,x15,[x0,#8*0]
   1566  1.1  christos 	sbcs	x14,x23,x10
   1567  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1568  1.1  christos 	sbcs	x15,x24,x11
   1569  1.1  christos 	stp	x16,x17,[x0,#8*2]
   1570  1.1  christos 	sbcs	x16,x25,x12
   1571  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1572  1.1  christos 	sbcs	x17,x26,x13
   1573  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1574  1.1  christos 	add	x1,x1,#8*8
   1575  1.1  christos 	ldp	x19,x20,[x2,#8*0]
   1576  1.1  christos 	sub	x27,x27,#8*8
   1577  1.1  christos 	ldp	x21,x22,[x2,#8*2]
   1578  1.1  christos 	ldp	x23,x24,[x2,#8*4]
   1579  1.1  christos 	ldp	x25,x26,[x2,#8*6]
   1580  1.1  christos 	add	x2,x2,#8*8
   1581  1.1  christos 	stp	x14,x15,[x0,#8*4]
   1582  1.1  christos 	sbcs	x14,x19,x6
   1583  1.1  christos 	stp	x16,x17,[x0,#8*6]
   1584  1.1  christos 	add	x0,x0,#8*8
   1585  1.1  christos 	sbcs	x15,x20,x7
   1586  1.1  christos 	cbnz	x27,.Lsqr8x_sub
   1587  1.1  christos 
   1588  1.1  christos 	sbcs	x16,x21,x8
   1589  1.1  christos 	mov	x2,sp
   1590  1.1  christos 	add	x1,sp,x5
   1591  1.1  christos 	ldp	x6,x7,[x3,#8*0]
   1592  1.1  christos 	sbcs	x17,x22,x9
   1593  1.1  christos 	stp	x14,x15,[x0,#8*0]
   1594  1.1  christos 	sbcs	x14,x23,x10
   1595  1.1  christos 	ldp	x8,x9,[x3,#8*2]
   1596  1.1  christos 	sbcs	x15,x24,x11
   1597  1.1  christos 	stp	x16,x17,[x0,#8*2]
   1598  1.1  christos 	sbcs	x16,x25,x12
   1599  1.1  christos 	ldp	x19,x20,[x1,#8*0]
   1600  1.1  christos 	sbcs	x17,x26,x13
   1601  1.1  christos 	ldp	x21,x22,[x1,#8*2]
   1602  1.1  christos 	sbcs	xzr,x30,xzr	// did it borrow?
   1603  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   1604  1.1  christos 	stp	x14,x15,[x0,#8*4]
   1605  1.1  christos 	stp	x16,x17,[x0,#8*6]
   1606  1.1  christos 
   1607  1.1  christos 	sub	x27,x5,#8*4
   1608  1.1  christos .Lsqr4x_cond_copy:
   1609  1.1  christos 	sub	x27,x27,#8*4
   1610  1.1  christos 	csel	x14,x19,x6,lo
   1611  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
   1612  1.1  christos 	csel	x15,x20,x7,lo
   1613  1.1  christos 	ldp	x6,x7,[x3,#8*4]
   1614  1.1  christos 	ldp	x19,x20,[x1,#8*4]
   1615  1.1  christos 	csel	x16,x21,x8,lo
   1616  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
   1617  1.1  christos 	add	x2,x2,#8*4
   1618  1.1  christos 	csel	x17,x22,x9,lo
   1619  1.1  christos 	ldp	x8,x9,[x3,#8*6]
   1620  1.1  christos 	ldp	x21,x22,[x1,#8*6]
   1621  1.1  christos 	add	x1,x1,#8*4
   1622  1.1  christos 	stp	x14,x15,[x3,#8*0]
   1623  1.1  christos 	stp	x16,x17,[x3,#8*2]
   1624  1.1  christos 	add	x3,x3,#8*4
   1625  1.1  christos 	stp	xzr,xzr,[x1,#8*0]
   1626  1.1  christos 	stp	xzr,xzr,[x1,#8*2]
   1627  1.1  christos 	cbnz	x27,.Lsqr4x_cond_copy
   1628  1.1  christos 
   1629  1.1  christos 	csel	x14,x19,x6,lo
   1630  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
   1631  1.1  christos 	csel	x15,x20,x7,lo
   1632  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
   1633  1.1  christos 	csel	x16,x21,x8,lo
   1634  1.1  christos 	csel	x17,x22,x9,lo
   1635  1.1  christos 	stp	x14,x15,[x3,#8*0]
   1636  1.1  christos 	stp	x16,x17,[x3,#8*2]
   1637  1.1  christos 
   1638  1.1  christos 	b	.Lsqr8x_done
   1639  1.1  christos 
   1640  1.1  christos .align	4
   1641  1.1  christos .Lsqr8x8_post_condition:
   1642  1.1  christos 	adc	x28,xzr,xzr
   1643  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   1644  1.1  christos 	// x19-x26,x28 hold result, x6-x13 hold modulus
   1645  1.1  christos 	subs	x6,x19,x6
   1646  1.1  christos 	ldr	x1,[x29,#96]		// pull rp
   1647  1.1  christos 	sbcs	x7,x20,x7
   1648  1.1  christos 	stp	xzr,xzr,[sp,#8*0]
   1649  1.1  christos 	sbcs	x8,x21,x8
   1650  1.1  christos 	stp	xzr,xzr,[sp,#8*2]
   1651  1.1  christos 	sbcs	x9,x22,x9
   1652  1.1  christos 	stp	xzr,xzr,[sp,#8*4]
   1653  1.1  christos 	sbcs	x10,x23,x10
   1654  1.1  christos 	stp	xzr,xzr,[sp,#8*6]
   1655  1.1  christos 	sbcs	x11,x24,x11
   1656  1.1  christos 	stp	xzr,xzr,[sp,#8*8]
   1657  1.1  christos 	sbcs	x12,x25,x12
   1658  1.1  christos 	stp	xzr,xzr,[sp,#8*10]
   1659  1.1  christos 	sbcs	x13,x26,x13
   1660  1.1  christos 	stp	xzr,xzr,[sp,#8*12]
   1661  1.1  christos 	sbcs	x28,x28,xzr	// did it borrow?
   1662  1.1  christos 	stp	xzr,xzr,[sp,#8*14]
   1663  1.1  christos 
   1664  1.1  christos 	// x6-x13 hold result-modulus
   1665  1.1  christos 	csel	x6,x19,x6,lo
   1666  1.1  christos 	csel	x7,x20,x7,lo
   1667  1.1  christos 	csel	x8,x21,x8,lo
   1668  1.1  christos 	csel	x9,x22,x9,lo
   1669  1.1  christos 	stp	x6,x7,[x1,#8*0]
   1670  1.1  christos 	csel	x10,x23,x10,lo
   1671  1.1  christos 	csel	x11,x24,x11,lo
   1672  1.1  christos 	stp	x8,x9,[x1,#8*2]
   1673  1.1  christos 	csel	x12,x25,x12,lo
   1674  1.1  christos 	csel	x13,x26,x13,lo
   1675  1.1  christos 	stp	x10,x11,[x1,#8*4]
   1676  1.1  christos 	stp	x12,x13,[x1,#8*6]
   1677  1.1  christos 
   1678  1.1  christos .Lsqr8x_done:
   1679  1.1  christos 	ldp	x19,x20,[x29,#16]
   1680  1.1  christos 	mov	sp,x29
   1681  1.1  christos 	ldp	x21,x22,[x29,#32]
   1682  1.1  christos 	mov	x0,#1
   1683  1.1  christos 	ldp	x23,x24,[x29,#48]
   1684  1.1  christos 	ldp	x25,x26,[x29,#64]
   1685  1.1  christos 	ldp	x27,x28,[x29,#80]
   1686  1.1  christos 	ldr	x29,[sp],#128
   1687  1.2  christos 	// x30 is loaded earlier
   1688  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1689  1.1  christos 	ret
   1690  1.1  christos .size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
   1691  1.1  christos .type	__bn_mul4x_mont,%function
   1692  1.1  christos .align	5
   1693  1.1  christos __bn_mul4x_mont:
   1694  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
   1695  1.2  christos 	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
   1696  1.1  christos 	stp	x29,x30,[sp,#-128]!
   1697  1.1  christos 	add	x29,sp,#0
   1698  1.1  christos 	stp	x19,x20,[sp,#16]
   1699  1.1  christos 	stp	x21,x22,[sp,#32]
   1700  1.1  christos 	stp	x23,x24,[sp,#48]
   1701  1.1  christos 	stp	x25,x26,[sp,#64]
   1702  1.1  christos 	stp	x27,x28,[sp,#80]
   1703  1.1  christos 
   1704  1.1  christos 	sub	x26,sp,x5,lsl#3
   1705  1.1  christos 	lsl	x5,x5,#3
   1706  1.1  christos 	ldr	x4,[x4]		// *n0
   1707  1.1  christos 	sub	sp,x26,#8*4		// alloca
   1708  1.1  christos 
   1709  1.1  christos 	add	x10,x2,x5
   1710  1.1  christos 	add	x27,x1,x5
   1711  1.1  christos 	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
   1712  1.1  christos 
   1713  1.1  christos 	ldr	x24,[x2,#8*0]		// b[0]
   1714  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1715  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1716  1.1  christos 	add	x1,x1,#8*4
   1717  1.1  christos 	mov	x19,xzr
   1718  1.1  christos 	mov	x20,xzr
   1719  1.1  christos 	mov	x21,xzr
   1720  1.1  christos 	mov	x22,xzr
   1721  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1722  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1723  1.1  christos 	adds	x3,x3,#8*4		// clear carry bit
   1724  1.1  christos 	mov	x0,xzr
   1725  1.1  christos 	mov	x28,#0
   1726  1.1  christos 	mov	x26,sp
   1727  1.1  christos 
   1728  1.1  christos .Loop_mul4x_1st_reduction:
   1729  1.1  christos 	mul	x10,x6,x24		// lo(a[0..3]*b[0])
   1730  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1731  1.1  christos 	mul	x11,x7,x24
   1732  1.1  christos 	add	x28,x28,#8
   1733  1.1  christos 	mul	x12,x8,x24
   1734  1.1  christos 	and	x28,x28,#31
   1735  1.1  christos 	mul	x13,x9,x24
   1736  1.1  christos 	adds	x19,x19,x10
   1737  1.1  christos 	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
   1738  1.1  christos 	adcs	x20,x20,x11
   1739  1.1  christos 	mul	x25,x19,x4		// t[0]*n0
   1740  1.1  christos 	adcs	x21,x21,x12
   1741  1.1  christos 	umulh	x11,x7,x24
   1742  1.1  christos 	adcs	x22,x22,x13
   1743  1.1  christos 	umulh	x12,x8,x24
   1744  1.1  christos 	adc	x23,xzr,xzr
   1745  1.1  christos 	umulh	x13,x9,x24
   1746  1.1  christos 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1747  1.1  christos 	adds	x20,x20,x10
   1748  1.1  christos 	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
   1749  1.1  christos 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1750  1.1  christos 	adcs	x21,x21,x11
   1751  1.1  christos 	mul	x11,x15,x25
   1752  1.1  christos 	adcs	x22,x22,x12
   1753  1.1  christos 	mul	x12,x16,x25
   1754  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1755  1.1  christos 	mul	x13,x17,x25
   1756  1.1  christos 	// (*)	adds	xzr,x19,x10
   1757  1.1  christos 	subs	xzr,x19,#1		// (*)
   1758  1.1  christos 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1759  1.1  christos 	adcs	x19,x20,x11
   1760  1.1  christos 	umulh	x11,x15,x25
   1761  1.1  christos 	adcs	x20,x21,x12
   1762  1.1  christos 	umulh	x12,x16,x25
   1763  1.1  christos 	adcs	x21,x22,x13
   1764  1.1  christos 	umulh	x13,x17,x25
   1765  1.1  christos 	adcs	x22,x23,x0
   1766  1.1  christos 	adc	x0,xzr,xzr
   1767  1.1  christos 	adds	x19,x19,x10
   1768  1.1  christos 	sub	x10,x27,x1
   1769  1.1  christos 	adcs	x20,x20,x11
   1770  1.1  christos 	adcs	x21,x21,x12
   1771  1.1  christos 	adcs	x22,x22,x13
   1772  1.1  christos 	//adc	x0,x0,xzr
   1773  1.1  christos 	cbnz	x28,.Loop_mul4x_1st_reduction
   1774  1.1  christos 
   1775  1.1  christos 	cbz	x10,.Lmul4x4_post_condition
   1776  1.1  christos 
   1777  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1778  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1779  1.1  christos 	add	x1,x1,#8*4
   1780  1.1  christos 	ldr	x25,[sp]		// a[0]*n0
   1781  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1782  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1783  1.1  christos 	add	x3,x3,#8*4
   1784  1.1  christos 
   1785  1.1  christos .Loop_mul4x_1st_tail:
   1786  1.1  christos 	mul	x10,x6,x24		// lo(a[4..7]*b[i])
   1787  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1788  1.1  christos 	mul	x11,x7,x24
   1789  1.1  christos 	add	x28,x28,#8
   1790  1.1  christos 	mul	x12,x8,x24
   1791  1.1  christos 	and	x28,x28,#31
   1792  1.1  christos 	mul	x13,x9,x24
   1793  1.1  christos 	adds	x19,x19,x10
   1794  1.1  christos 	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
   1795  1.1  christos 	adcs	x20,x20,x11
   1796  1.1  christos 	umulh	x11,x7,x24
   1797  1.1  christos 	adcs	x21,x21,x12
   1798  1.1  christos 	umulh	x12,x8,x24
   1799  1.1  christos 	adcs	x22,x22,x13
   1800  1.1  christos 	umulh	x13,x9,x24
   1801  1.1  christos 	adc	x23,xzr,xzr
   1802  1.1  christos 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1803  1.1  christos 	adds	x20,x20,x10
   1804  1.1  christos 	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
   1805  1.1  christos 	adcs	x21,x21,x11
   1806  1.1  christos 	mul	x11,x15,x25
   1807  1.1  christos 	adcs	x22,x22,x12
   1808  1.1  christos 	mul	x12,x16,x25
   1809  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1810  1.1  christos 	mul	x13,x17,x25
   1811  1.1  christos 	adds	x19,x19,x10
   1812  1.1  christos 	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
   1813  1.1  christos 	adcs	x20,x20,x11
   1814  1.1  christos 	umulh	x11,x15,x25
   1815  1.1  christos 	adcs	x21,x21,x12
   1816  1.1  christos 	umulh	x12,x16,x25
   1817  1.1  christos 	adcs	x22,x22,x13
   1818  1.1  christos 	adcs	x23,x23,x0
   1819  1.1  christos 	umulh	x13,x17,x25
   1820  1.1  christos 	adc	x0,xzr,xzr
   1821  1.1  christos 	ldr	x25,[sp,x28]		// next t[0]*n0
   1822  1.1  christos 	str	x19,[x26],#8		// result!!!
   1823  1.1  christos 	adds	x19,x20,x10
   1824  1.1  christos 	sub	x10,x27,x1		// done yet?
   1825  1.1  christos 	adcs	x20,x21,x11
   1826  1.1  christos 	adcs	x21,x22,x12
   1827  1.1  christos 	adcs	x22,x23,x13
   1828  1.1  christos 	//adc	x0,x0,xzr
   1829  1.1  christos 	cbnz	x28,.Loop_mul4x_1st_tail
   1830  1.1  christos 
   1831  1.1  christos 	sub	x11,x27,x5	// rewinded x1
   1832  1.1  christos 	cbz	x10,.Lmul4x_proceed
   1833  1.1  christos 
   1834  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1835  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1836  1.1  christos 	add	x1,x1,#8*4
   1837  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   1838  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1839  1.1  christos 	add	x3,x3,#8*4
   1840  1.1  christos 	b	.Loop_mul4x_1st_tail
   1841  1.1  christos 
   1842  1.1  christos .align	5
   1843  1.1  christos .Lmul4x_proceed:
   1844  1.1  christos 	ldr	x24,[x2,#8*4]!		// *++b
   1845  1.1  christos 	adc	x30,x0,xzr
   1846  1.1  christos 	ldp	x6,x7,[x11,#8*0]	// a[0..3]
   1847  1.1  christos 	sub	x3,x3,x5		// rewind np
   1848  1.1  christos 	ldp	x8,x9,[x11,#8*2]
   1849  1.1  christos 	add	x1,x11,#8*4
   1850  1.1  christos 
   1851  1.1  christos 	stp	x19,x20,[x26,#8*0]	// result!!!
   1852  1.1  christos 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1853  1.1  christos 	stp	x21,x22,[x26,#8*2]	// result!!!
   1854  1.1  christos 	ldp	x21,x22,[sp,#8*6]
   1855  1.1  christos 
   1856  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1857  1.1  christos 	mov	x26,sp
   1858  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1859  1.1  christos 	adds	x3,x3,#8*4		// clear carry bit
   1860  1.1  christos 	mov	x0,xzr
   1861  1.1  christos 
   1862  1.1  christos .align	4
   1863  1.1  christos .Loop_mul4x_reduction:
   1864  1.1  christos 	mul	x10,x6,x24		// lo(a[0..3]*b[4])
   1865  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1866  1.1  christos 	mul	x11,x7,x24
   1867  1.1  christos 	add	x28,x28,#8
   1868  1.1  christos 	mul	x12,x8,x24
   1869  1.1  christos 	and	x28,x28,#31
   1870  1.1  christos 	mul	x13,x9,x24
   1871  1.1  christos 	adds	x19,x19,x10
   1872  1.1  christos 	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
   1873  1.1  christos 	adcs	x20,x20,x11
   1874  1.1  christos 	mul	x25,x19,x4		// t[0]*n0
   1875  1.1  christos 	adcs	x21,x21,x12
   1876  1.1  christos 	umulh	x11,x7,x24
   1877  1.1  christos 	adcs	x22,x22,x13
   1878  1.1  christos 	umulh	x12,x8,x24
   1879  1.1  christos 	adc	x23,xzr,xzr
   1880  1.1  christos 	umulh	x13,x9,x24
   1881  1.1  christos 	ldr	x24,[x2,x28]		// next b[i]
   1882  1.1  christos 	adds	x20,x20,x10
   1883  1.1  christos 	// (*)	mul	x10,x14,x25
   1884  1.1  christos 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1885  1.1  christos 	adcs	x21,x21,x11
   1886  1.1  christos 	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
   1887  1.1  christos 	adcs	x22,x22,x12
   1888  1.1  christos 	mul	x12,x16,x25
   1889  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1890  1.1  christos 	mul	x13,x17,x25
   1891  1.1  christos 	// (*)	adds	xzr,x19,x10
   1892  1.1  christos 	subs	xzr,x19,#1		// (*)
   1893  1.1  christos 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
   1894  1.1  christos 	adcs	x19,x20,x11
   1895  1.1  christos 	umulh	x11,x15,x25
   1896  1.1  christos 	adcs	x20,x21,x12
   1897  1.1  christos 	umulh	x12,x16,x25
   1898  1.1  christos 	adcs	x21,x22,x13
   1899  1.1  christos 	umulh	x13,x17,x25
   1900  1.1  christos 	adcs	x22,x23,x0
   1901  1.1  christos 	adc	x0,xzr,xzr
   1902  1.1  christos 	adds	x19,x19,x10
   1903  1.1  christos 	adcs	x20,x20,x11
   1904  1.1  christos 	adcs	x21,x21,x12
   1905  1.1  christos 	adcs	x22,x22,x13
   1906  1.1  christos 	//adc	x0,x0,xzr
   1907  1.1  christos 	cbnz	x28,.Loop_mul4x_reduction
   1908  1.1  christos 
   1909  1.1  christos 	adc	x0,x0,xzr
   1910  1.1  christos 	ldp	x10,x11,[x26,#8*4]	// t[4..7]
   1911  1.1  christos 	ldp	x12,x13,[x26,#8*6]
   1912  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1913  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1914  1.1  christos 	add	x1,x1,#8*4
   1915  1.1  christos 	adds	x19,x19,x10
   1916  1.1  christos 	adcs	x20,x20,x11
   1917  1.1  christos 	adcs	x21,x21,x12
   1918  1.1  christos 	adcs	x22,x22,x13
   1919  1.1  christos 	//adc	x0,x0,xzr
   1920  1.1  christos 
   1921  1.1  christos 	ldr	x25,[sp]		// t[0]*n0
   1922  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1923  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1924  1.1  christos 	add	x3,x3,#8*4
   1925  1.1  christos 
   1926  1.1  christos .align	4
   1927  1.1  christos .Loop_mul4x_tail:
   1928  1.1  christos 	mul	x10,x6,x24		// lo(a[4..7]*b[4])
   1929  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1930  1.1  christos 	mul	x11,x7,x24
   1931  1.1  christos 	add	x28,x28,#8
   1932  1.1  christos 	mul	x12,x8,x24
   1933  1.1  christos 	and	x28,x28,#31
   1934  1.1  christos 	mul	x13,x9,x24
   1935  1.1  christos 	adds	x19,x19,x10
   1936  1.1  christos 	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
   1937  1.1  christos 	adcs	x20,x20,x11
   1938  1.1  christos 	umulh	x11,x7,x24
   1939  1.1  christos 	adcs	x21,x21,x12
   1940  1.1  christos 	umulh	x12,x8,x24
   1941  1.1  christos 	adcs	x22,x22,x13
   1942  1.1  christos 	umulh	x13,x9,x24
   1943  1.1  christos 	adc	x23,xzr,xzr
   1944  1.1  christos 	ldr	x24,[x2,x28]		// next b[i]
   1945  1.1  christos 	adds	x20,x20,x10
   1946  1.1  christos 	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
   1947  1.1  christos 	adcs	x21,x21,x11
   1948  1.1  christos 	mul	x11,x15,x25
   1949  1.1  christos 	adcs	x22,x22,x12
   1950  1.1  christos 	mul	x12,x16,x25
   1951  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1952  1.1  christos 	mul	x13,x17,x25
   1953  1.1  christos 	adds	x19,x19,x10
   1954  1.1  christos 	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
   1955  1.1  christos 	adcs	x20,x20,x11
   1956  1.1  christos 	umulh	x11,x15,x25
   1957  1.1  christos 	adcs	x21,x21,x12
   1958  1.1  christos 	umulh	x12,x16,x25
   1959  1.1  christos 	adcs	x22,x22,x13
   1960  1.1  christos 	umulh	x13,x17,x25
   1961  1.1  christos 	adcs	x23,x23,x0
   1962  1.1  christos 	ldr	x25,[sp,x28]		// next a[0]*n0
   1963  1.1  christos 	adc	x0,xzr,xzr
   1964  1.1  christos 	str	x19,[x26],#8		// result!!!
   1965  1.1  christos 	adds	x19,x20,x10
   1966  1.1  christos 	sub	x10,x27,x1		// done yet?
   1967  1.1  christos 	adcs	x20,x21,x11
   1968  1.1  christos 	adcs	x21,x22,x12
   1969  1.1  christos 	adcs	x22,x23,x13
   1970  1.1  christos 	//adc	x0,x0,xzr
   1971  1.1  christos 	cbnz	x28,.Loop_mul4x_tail
   1972  1.1  christos 
   1973  1.1  christos 	sub	x11,x3,x5		// rewinded np?
   1974  1.1  christos 	adc	x0,x0,xzr
   1975  1.1  christos 	cbz	x10,.Loop_mul4x_break
   1976  1.1  christos 
   1977  1.1  christos 	ldp	x10,x11,[x26,#8*4]
   1978  1.1  christos 	ldp	x12,x13,[x26,#8*6]
   1979  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1980  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1981  1.1  christos 	add	x1,x1,#8*4
   1982  1.1  christos 	adds	x19,x19,x10
   1983  1.1  christos 	adcs	x20,x20,x11
   1984  1.1  christos 	adcs	x21,x21,x12
   1985  1.1  christos 	adcs	x22,x22,x13
   1986  1.1  christos 	//adc	x0,x0,xzr
   1987  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   1988  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1989  1.1  christos 	add	x3,x3,#8*4
   1990  1.1  christos 	b	.Loop_mul4x_tail
   1991  1.1  christos 
   1992  1.1  christos .align	4
   1993  1.1  christos .Loop_mul4x_break:
   1994  1.1  christos 	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
   1995  1.1  christos 	adds	x19,x19,x30
   1996  1.1  christos 	add	x2,x2,#8*4		// bp++
   1997  1.1  christos 	adcs	x20,x20,xzr
   1998  1.1  christos 	sub	x1,x1,x5		// rewind ap
   1999  1.1  christos 	adcs	x21,x21,xzr
   2000  1.1  christos 	stp	x19,x20,[x26,#8*0]	// result!!!
   2001  1.1  christos 	adcs	x22,x22,xzr
   2002  1.1  christos 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   2003  1.1  christos 	adc	x30,x0,xzr
   2004  1.1  christos 	stp	x21,x22,[x26,#8*2]	// result!!!
   2005  1.1  christos 	cmp	x2,x13			// done yet?
   2006  1.1  christos 	ldp	x21,x22,[sp,#8*6]
   2007  1.1  christos 	ldp	x14,x15,[x11,#8*0]	// n[0..3]
   2008  1.1  christos 	ldp	x16,x17,[x11,#8*2]
   2009  1.1  christos 	add	x3,x11,#8*4
   2010  1.1  christos 	b.eq	.Lmul4x_post
   2011  1.1  christos 
   2012  1.1  christos 	ldr	x24,[x2]
   2013  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   2014  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   2015  1.1  christos 	adds	x1,x1,#8*4		// clear carry bit
   2016  1.1  christos 	mov	x0,xzr
   2017  1.1  christos 	mov	x26,sp
   2018  1.1  christos 	b	.Loop_mul4x_reduction
   2019  1.1  christos 
   2020  1.1  christos .align	4
   2021  1.1  christos .Lmul4x_post:
   2022  1.1  christos 	// Final step. We see if result is larger than modulus, and
   2023  1.1  christos 	// if it is, subtract the modulus. But comparison implies
   2024  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
   2025  1.1  christos 	// and conditionally copy original value.
   2026  1.1  christos 	mov	x0,x12
   2027  1.1  christos 	mov	x27,x12		// x0 copy
   2028  1.1  christos 	subs	x10,x19,x14
   2029  1.1  christos 	add	x26,sp,#8*8
   2030  1.1  christos 	sbcs	x11,x20,x15
   2031  1.1  christos 	sub	x28,x5,#8*4
   2032  1.1  christos 
   2033  1.1  christos .Lmul4x_sub:
   2034  1.1  christos 	sbcs	x12,x21,x16
   2035  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   2036  1.1  christos 	sub	x28,x28,#8*4
   2037  1.1  christos 	ldp	x19,x20,[x26,#8*0]
   2038  1.1  christos 	sbcs	x13,x22,x17
   2039  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   2040  1.1  christos 	add	x3,x3,#8*4
   2041  1.1  christos 	ldp	x21,x22,[x26,#8*2]
   2042  1.1  christos 	add	x26,x26,#8*4
   2043  1.1  christos 	stp	x10,x11,[x0,#8*0]
   2044  1.1  christos 	sbcs	x10,x19,x14
   2045  1.1  christos 	stp	x12,x13,[x0,#8*2]
   2046  1.1  christos 	add	x0,x0,#8*4
   2047  1.1  christos 	sbcs	x11,x20,x15
   2048  1.1  christos 	cbnz	x28,.Lmul4x_sub
   2049  1.1  christos 
   2050  1.1  christos 	sbcs	x12,x21,x16
   2051  1.1  christos 	mov	x26,sp
   2052  1.1  christos 	add	x1,sp,#8*4
   2053  1.1  christos 	ldp	x6,x7,[x27,#8*0]
   2054  1.1  christos 	sbcs	x13,x22,x17
   2055  1.1  christos 	stp	x10,x11,[x0,#8*0]
   2056  1.1  christos 	ldp	x8,x9,[x27,#8*2]
   2057  1.1  christos 	stp	x12,x13,[x0,#8*2]
   2058  1.1  christos 	ldp	x19,x20,[x1,#8*0]
   2059  1.1  christos 	ldp	x21,x22,[x1,#8*2]
   2060  1.1  christos 	sbcs	xzr,x30,xzr	// did it borrow?
   2061  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   2062  1.1  christos 
   2063  1.1  christos 	sub	x28,x5,#8*4
   2064  1.1  christos .Lmul4x_cond_copy:
   2065  1.1  christos 	sub	x28,x28,#8*4
   2066  1.1  christos 	csel	x10,x19,x6,lo
   2067  1.1  christos 	stp	xzr,xzr,[x26,#8*0]
   2068  1.1  christos 	csel	x11,x20,x7,lo
   2069  1.1  christos 	ldp	x6,x7,[x27,#8*4]
   2070  1.1  christos 	ldp	x19,x20,[x1,#8*4]
   2071  1.1  christos 	csel	x12,x21,x8,lo
   2072  1.1  christos 	stp	xzr,xzr,[x26,#8*2]
   2073  1.1  christos 	add	x26,x26,#8*4
   2074  1.1  christos 	csel	x13,x22,x9,lo
   2075  1.1  christos 	ldp	x8,x9,[x27,#8*6]
   2076  1.1  christos 	ldp	x21,x22,[x1,#8*6]
   2077  1.1  christos 	add	x1,x1,#8*4
   2078  1.1  christos 	stp	x10,x11,[x27,#8*0]
   2079  1.1  christos 	stp	x12,x13,[x27,#8*2]
   2080  1.1  christos 	add	x27,x27,#8*4
   2081  1.1  christos 	cbnz	x28,.Lmul4x_cond_copy
   2082  1.1  christos 
   2083  1.1  christos 	csel	x10,x19,x6,lo
   2084  1.1  christos 	stp	xzr,xzr,[x26,#8*0]
   2085  1.1  christos 	csel	x11,x20,x7,lo
   2086  1.1  christos 	stp	xzr,xzr,[x26,#8*2]
   2087  1.1  christos 	csel	x12,x21,x8,lo
   2088  1.1  christos 	stp	xzr,xzr,[x26,#8*3]
   2089  1.1  christos 	csel	x13,x22,x9,lo
   2090  1.1  christos 	stp	xzr,xzr,[x26,#8*4]
   2091  1.1  christos 	stp	x10,x11,[x27,#8*0]
   2092  1.1  christos 	stp	x12,x13,[x27,#8*2]
   2093  1.1  christos 
   2094  1.1  christos 	b	.Lmul4x_done
   2095  1.1  christos 
   2096  1.1  christos .align	4
   2097  1.1  christos .Lmul4x4_post_condition:
   2098  1.1  christos 	adc	x0,x0,xzr
   2099  1.1  christos 	ldr	x1,[x29,#96]		// pull rp
   2100  1.1  christos 	// x19-x22,x0 hold result, x14-x17 hold modulus
   2101  1.1  christos 	subs	x6,x19,x14
   2102  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   2103  1.1  christos 	sbcs	x7,x20,x15
   2104  1.1  christos 	stp	xzr,xzr,[sp,#8*0]
   2105  1.1  christos 	sbcs	x8,x21,x16
   2106  1.1  christos 	stp	xzr,xzr,[sp,#8*2]
   2107  1.1  christos 	sbcs	x9,x22,x17
   2108  1.1  christos 	stp	xzr,xzr,[sp,#8*4]
   2109  1.1  christos 	sbcs	xzr,x0,xzr		// did it borrow?
   2110  1.1  christos 	stp	xzr,xzr,[sp,#8*6]
   2111  1.1  christos 
   2112  1.1  christos 	// x6-x9 hold result-modulus
   2113  1.1  christos 	csel	x6,x19,x6,lo
   2114  1.1  christos 	csel	x7,x20,x7,lo
   2115  1.1  christos 	csel	x8,x21,x8,lo
   2116  1.1  christos 	csel	x9,x22,x9,lo
   2117  1.1  christos 	stp	x6,x7,[x1,#8*0]
   2118  1.1  christos 	stp	x8,x9,[x1,#8*2]
   2119  1.1  christos 
   2120  1.1  christos .Lmul4x_done:
   2121  1.1  christos 	ldp	x19,x20,[x29,#16]
   2122  1.1  christos 	mov	sp,x29
   2123  1.1  christos 	ldp	x21,x22,[x29,#32]
   2124  1.1  christos 	mov	x0,#1
   2125  1.1  christos 	ldp	x23,x24,[x29,#48]
   2126  1.1  christos 	ldp	x25,x26,[x29,#64]
   2127  1.1  christos 	ldp	x27,x28,[x29,#80]
   2128  1.1  christos 	ldr	x29,[sp],#128
   2129  1.2  christos 	// x30 loaded earlier
   2130  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   2131  1.1  christos 	ret
   2132  1.1  christos .size	__bn_mul4x_mont,.-__bn_mul4x_mont
   2133  1.2  christos .section	.rodata	// identification string is emitted into read-only data, not .text
   2134  1.1  christos .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// NUL-terminated ASCII: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
   2135  1.1  christos .align	2	// pad after the string; operand is a power-of-two exponent here (2^2 = 4 bytes)
   2136  1.1  christos .align	4	// then pad to 2^4 = 16 bytes; this stricter alignment subsumes the previous one
   2137