Home | History | Annotate | Line # | Download | only in arm
      1  1.1  christos #include "arm_asm.h"
      2  1.2  christos #include "arm_arch.h"
      3  1.1  christos #ifndef	__KERNEL__
      4  1.1  christos 
      5  1.1  christos .hidden	OPENSSL_armv8_rsa_neonized
      6  1.1  christos #endif
      7  1.1  christos .text
      8  1.1  christos 
      9  1.1  christos .globl	bn_mul_mont
     10  1.1  christos .type	bn_mul_mont,%function
     11  1.1  christos .align	5
                       // bn_mul_mont -- Montgomery multiplication, entry point and generic path.
                       //
                       // Register roles on entry, as established by the loads and the
                       // existing comments in the body:
                       //   x0 = rp   result words (written in .Lsub / .Lcond_copy)
                       //   x1 = ap   first operand words
                       //   x2 = bp   second operand words
                       //   x3 = np   modulus words
                       //   x4 = pointer to n0 (dereferenced once, value kept in x4)
                       //   x5 = num  number of 64-bit words
                       // The generic path returns with x0 = 1.
                       // NOTE(review): presumably this mirrors the C prototype of OpenSSL's
                       // bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the header.
                       //
                       // Dispatch performed before any stack frame is created:
                       //   num % 4 != 0          -> generic loop at .Lmul_mont
                       //   num > 32 and OPENSSL_armv8_rsa_neonized != 0
                       //                         -> bn_mul8x_mont_neon (only assembled when
                       //                            neither __KERNEL__ nor __AARCH64EB__ is
                       //                            defined)
                       //   num % 8 == 0          -> __bn_sqr8x_mont
                       //   num % 4 == 0          -> __bn_mul4x_mont
                       // All of these are branches (not calls), so the targets return
                       // directly to this function's caller.
     12  1.1  christos bn_mul_mont:
                       // NOTE(review): macro is not defined in this view; presumably it comes
                       // from arm_arch.h and signs x30 when pointer authentication is
                       // enabled -- confirm.
     13  1.2  christos 	AARCH64_SIGN_LINK_REGISTER
     14  1.1  christos .Lbn_mul_mont:
     15  1.1  christos 	tst	x5,#3
     16  1.1  christos 	b.ne	.Lmul_mont
                       // Signed compare: only num > 32 is considered for the NEON path.
     17  1.1  christos 	cmp	x5,#32
     18  1.1  christos 	b.le	.Lscalar_impl
     19  1.1  christos #ifndef	__KERNEL__
     20  1.1  christos #ifndef	__AARCH64EB__
     21  1.1  christos 	adrp	x17,OPENSSL_armv8_rsa_neonized
     22  1.1  christos 	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
     23  1.1  christos 	cbnz	w17, bn_mul8x_mont_neon
     24  1.1  christos #endif
     25  1.1  christos #endif
     26  1.1  christos 
     27  1.1  christos .Lscalar_impl:
     28  1.1  christos 	tst	x5,#7
     29  1.1  christos 	b.eq	__bn_sqr8x_mont
                       // For the entries visible here num % 4 == 0 was already established
                       // above, so this branch is taken whenever it is reached and
                       // .Lmul_mont is entered only through the b.ne at the top.
     30  1.1  christos 	tst	x5,#3
     31  1.1  christos 	b.eq	__bn_mul4x_mont
     32  1.1  christos 
                       // Generic word-by-word path. A 64-byte frame holds x29/x30 and
                       // x19-x24; x29 keeps the frame address so the variable-size scratch
                       // area allocated below can be released in the epilogue.
     33  1.1  christos .Lmul_mont:
     34  1.1  christos 	stp	x29,x30,[sp,#-64]!
     35  1.1  christos 	add	x29,sp,#0
     36  1.1  christos 	stp	x19,x20,[sp,#16]
     37  1.1  christos 	stp	x21,x22,[sp,#32]
     38  1.1  christos 	stp	x23,x24,[sp,#48]
     39  1.1  christos 
                       // Interleaved setup: load bp[0], ap[0..1], np[0..1] and n0, compute
                       // the address of a num*8-byte scratch vector tp below the frame
                       // (rounded down to 16 bytes), and convert x5 from a word count to a
                       // byte count (num*8). From here on x5, x20 and x21 are byte counts.
     40  1.1  christos 	ldr	x9,[x2],#8		// bp[0]
     41  1.1  christos 	sub	x22,sp,x5,lsl#3
     42  1.1  christos 	ldp	x7,x8,[x1],#16	// ap[0..1]
     43  1.1  christos 	lsl	x5,x5,#3
     44  1.1  christos 	ldr	x4,[x4]		// *n0
     45  1.1  christos 	and	x22,x22,#-16		// ABI says so
     46  1.1  christos 	ldp	x13,x14,[x3],#16	// np[0..1]
     47  1.1  christos 
     48  1.1  christos 	mul	x6,x7,x9		// ap[0]*bp[0]
     49  1.1  christos 	sub	x21,x5,#16		// j=num-2
     50  1.1  christos 	umulh	x7,x7,x9
     51  1.1  christos 	mul	x10,x8,x9		// ap[1]*bp[0]
     52  1.1  christos 	umulh	x11,x8,x9
     53  1.1  christos 
                       // x15 = m1, the per-row multiplier applied to every np[j] below.
     54  1.1  christos 	mul	x15,x6,x4		// "tp[0]"*n0
     55  1.1  christos 	mov	sp,x22			// alloca
     56  1.1  christos 
     57  1.1  christos 	// (*)	mul	x12,x13,x15	// np[0]*m1
     58  1.1  christos 	umulh	x13,x13,x15
     59  1.1  christos 	mul	x16,x14,x15		// np[1]*m1
     60  1.1  christos 	// (*)	adds	x12,x12,x6	// discarded
     61  1.1  christos 	// (*)	As for removal of first multiplication and addition
     62  1.1  christos 	//	instructions. The outcome of first addition is
     63  1.1  christos 	//	guaranteed to be zero, which leaves two computationally
     64  1.1  christos 	//	significant outcomes: it either carries or not. Then
     65  1.1  christos 	//	question is when does it carry? Is there alternative
     66  1.1  christos 	//	way to deduce it? If you follow operations, you can
     67  1.1  christos 	//	observe that condition for carry is quite simple:
     68  1.1  christos 	//	x6 being non-zero. So that carry can be calculated
     69  1.1  christos 	//	by adding -1 to x6. That's what next instruction does.
     70  1.1  christos 	subs	xzr,x6,#1		// (*)
     71  1.1  christos 	umulh	x17,x14,x15
     72  1.1  christos 	adc	x13,x13,xzr
     73  1.1  christos 	cbz	x21,.L1st_skip
     74  1.1  christos 
                       // First row (bp[0]): accumulate ap[j]*bp[0] + np[j]*m1 and store one
                       // tp word per iteration. x21 is the remaining j count in bytes, x22
                       // walks tp upwards from sp. x6/x7 carry the ap*bp column, x12/x13 the
                       // np*m1 column.
     75  1.1  christos .L1st:
     76  1.1  christos 	ldr	x8,[x1],#8
     77  1.1  christos 	adds	x6,x10,x7
     78  1.1  christos 	sub	x21,x21,#8		// j--
     79  1.1  christos 	adc	x7,x11,xzr
     80  1.1  christos 
     81  1.1  christos 	ldr	x14,[x3],#8
     82  1.1  christos 	adds	x12,x16,x13
     83  1.1  christos 	mul	x10,x8,x9		// ap[j]*bp[0]
     84  1.1  christos 	adc	x13,x17,xzr
     85  1.1  christos 	umulh	x11,x8,x9
     86  1.1  christos 
     87  1.1  christos 	adds	x12,x12,x6
     88  1.1  christos 	mul	x16,x14,x15		// np[j]*m1
     89  1.1  christos 	adc	x13,x13,xzr
     90  1.1  christos 	umulh	x17,x14,x15
     91  1.1  christos 	str	x12,[x22],#8		// tp[j-1]
     92  1.1  christos 	cbnz	x21,.L1st
     93  1.1  christos 
                       // Tail of the first row: fold the last partial products, rewind
                       // ap/np by num*8 bytes, store the two top tp words and keep the final
                       // carry-out in x19. x20 becomes the outer-loop counter in bytes.
     94  1.1  christos .L1st_skip:
     95  1.1  christos 	adds	x6,x10,x7
     96  1.1  christos 	sub	x1,x1,x5		// rewind x1
     97  1.1  christos 	adc	x7,x11,xzr
     98  1.1  christos 
     99  1.1  christos 	adds	x12,x16,x13
    100  1.1  christos 	sub	x3,x3,x5		// rewind x3
    101  1.1  christos 	adc	x13,x17,xzr
    102  1.1  christos 
    103  1.1  christos 	adds	x12,x12,x6
    104  1.1  christos 	sub	x20,x5,#8		// i=num-1
    105  1.1  christos 	adcs	x13,x13,x7
    106  1.1  christos 
    107  1.1  christos 	adc	x19,xzr,xzr		// upmost overflow bit
    108  1.1  christos 	stp	x12,x13,[x22]
    109  1.1  christos 
                       // Remaining rows (bp[1..num-1]): same accumulation as the first row,
                       // but each column additionally adds the tp word left by the previous
                       // row (loaded into x23).
    110  1.1  christos .Louter:
    111  1.1  christos 	ldr	x9,[x2],#8		// bp[i]
    112  1.1  christos 	ldp	x7,x8,[x1],#16
    113  1.1  christos 	ldr	x23,[sp]		// tp[0]
    114  1.1  christos 	add	x22,sp,#8
    115  1.1  christos 
    116  1.1  christos 	mul	x6,x7,x9		// ap[0]*bp[i]
    117  1.1  christos 	sub	x21,x5,#16		// j=num-2
    118  1.1  christos 	umulh	x7,x7,x9
    119  1.1  christos 	ldp	x13,x14,[x3],#16
    120  1.1  christos 	mul	x10,x8,x9		// ap[1]*bp[i]
    121  1.1  christos 	adds	x6,x6,x23
    122  1.1  christos 	umulh	x11,x8,x9
    123  1.1  christos 	adc	x7,x7,xzr
    124  1.1  christos 
    125  1.1  christos 	mul	x15,x6,x4
    126  1.1  christos 	sub	x20,x20,#8		// i--
    127  1.1  christos 
    128  1.1  christos 	// (*)	mul	x12,x13,x15	// np[0]*m1
    129  1.1  christos 	umulh	x13,x13,x15
    130  1.1  christos 	mul	x16,x14,x15		// np[1]*m1
    131  1.1  christos 	// (*)	adds	x12,x12,x6
    132  1.1  christos 	subs	xzr,x6,#1		// (*)
    133  1.1  christos 	umulh	x17,x14,x15
                       // The carry set by the subs above is still pending here: it is
                       // consumed by the first adc in .Linner or in .Linner_skip, and no
                       // instruction in between modifies the flags.
    134  1.1  christos 	cbz	x21,.Linner_skip
    135  1.1  christos 
                       // On the first pass the leading adc absorbs the carry of the subs
                       // above; on loop-back it absorbs the carry of the adds x12,x12,x6 at
                       // the bottom of the previous iteration. The flag must therefore stay
                       // untouched between that adds and the cbnz.
    136  1.1  christos .Linner:
    137  1.1  christos 	ldr	x8,[x1],#8
    138  1.1  christos 	adc	x13,x13,xzr
    139  1.1  christos 	ldr	x23,[x22],#8		// tp[j]
    140  1.1  christos 	adds	x6,x10,x7
    141  1.1  christos 	sub	x21,x21,#8		// j--
    142  1.1  christos 	adc	x7,x11,xzr
    143  1.1  christos 
    144  1.1  christos 	adds	x12,x16,x13
    145  1.1  christos 	ldr	x14,[x3],#8
    146  1.1  christos 	adc	x13,x17,xzr
    147  1.1  christos 
    148  1.1  christos 	mul	x10,x8,x9		// ap[j]*bp[i]
    149  1.1  christos 	adds	x6,x6,x23
    150  1.1  christos 	umulh	x11,x8,x9
    151  1.1  christos 	adc	x7,x7,xzr
    152  1.1  christos 
    153  1.1  christos 	mul	x16,x14,x15		// np[j]*m1
    154  1.1  christos 	adds	x12,x12,x6
    155  1.1  christos 	umulh	x17,x14,x15
    156  1.1  christos 	stur	x12,[x22,#-16]		// tp[j-1]
    157  1.1  christos 	cbnz	x21,.Linner
    158  1.1  christos 
                       // Tail of a row: consume the pending carry, rewind ap/np, add the
                       // previous row's overflow bit (x19) into the top word and recompute
                       // x19 as the new overflow bit before storing the two top tp words.
    159  1.1  christos .Linner_skip:
    160  1.1  christos 	ldr	x23,[x22],#8		// tp[j]
    161  1.1  christos 	adc	x13,x13,xzr
    162  1.1  christos 	adds	x6,x10,x7
    163  1.1  christos 	sub	x1,x1,x5		// rewind x1
    164  1.1  christos 	adc	x7,x11,xzr
    165  1.1  christos 
    166  1.1  christos 	adds	x12,x16,x13
    167  1.1  christos 	sub	x3,x3,x5		// rewind x3
    168  1.1  christos 	adcs	x13,x17,x19
    169  1.1  christos 	adc	x19,xzr,xzr
    170  1.1  christos 
    171  1.1  christos 	adds	x6,x6,x23
    172  1.1  christos 	adc	x7,x7,xzr
    173  1.1  christos 
    174  1.1  christos 	adds	x12,x12,x6
    175  1.1  christos 	adcs	x13,x13,x7
    176  1.1  christos 	adc	x19,x19,xzr		// upmost overflow bit
    177  1.1  christos 	stp	x12,x13,[x22,#-16]
    178  1.1  christos 
    179  1.1  christos 	cbnz	x20,.Louter
    180  1.1  christos 
    181  1.1  christos 	// Final step. We see if result is larger than modulus, and
    182  1.1  christos 	// if it is, subtract the modulus. But comparison implies
    183  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
    184  1.1  christos 	// and conditionally copy original value.
    185  1.1  christos 	ldr	x23,[sp]		// tp[0]
    186  1.1  christos 	add	x22,sp,#8
    187  1.1  christos 	ldr	x14,[x3],#8		// np[0]
    188  1.1  christos 	subs	x21,x5,#8		// j=num-1 and clear borrow
                       // x1 becomes the write cursor into rp; x0 stays at rp[0] for the
                       // copy-back pass below.
    189  1.1  christos 	mov	x1,x0
                       // rp[j] = tp[j] - np[j] with the borrow chained through sbcs; the
                       // loop contains no other flag-setting instruction.
    190  1.1  christos .Lsub:
    191  1.1  christos 	sbcs	x8,x23,x14		// tp[j]-np[j]
    192  1.1  christos 	ldr	x23,[x22],#8
    193  1.1  christos 	sub	x21,x21,#8		// j--
    194  1.1  christos 	ldr	x14,[x3],#8
    195  1.1  christos 	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
    196  1.1  christos 	cbnz	x21,.Lsub
    197  1.1  christos 
    198  1.1  christos 	sbcs	x8,x23,x14
    199  1.1  christos 	sbcs	x19,x19,xzr		// did it borrow?
    200  1.1  christos 	str	x8,[x1],#8		// rp[num-1]
    201  1.1  christos 
                       // The flags left by the sbcs on x19 remain live through the whole
                       // copy-back pass: every instruction up to the last csel is
                       // non-flag-setting.
    202  1.1  christos 	ldr	x23,[sp]		// tp[0]
    203  1.1  christos 	add	x22,sp,#8
    204  1.1  christos 	ldr	x8,[x0],#8		// rp[0]
    205  1.1  christos 	sub	x5,x5,#8		// num--
    206  1.1  christos 	nop
                       // Per word: keep tp[j] if the subtraction borrowed (lo), otherwise
                       // keep the difference already stored in rp[j]. Each tp word is
                       // overwritten with zero after it has been read.
    207  1.1  christos .Lcond_copy:
    208  1.1  christos 	sub	x5,x5,#8		// num--
    209  1.1  christos 	csel	x14,x23,x8,lo		// did it borrow?
    210  1.1  christos 	ldr	x23,[x22],#8
    211  1.1  christos 	ldr	x8,[x0],#8
    212  1.1  christos 	stur	xzr,[x22,#-16]		// wipe tp
    213  1.1  christos 	stur	x14,[x0,#-16]
    214  1.1  christos 	cbnz	x5,.Lcond_copy
    215  1.1  christos 
    216  1.1  christos 	csel	x14,x23,x8,lo
    217  1.1  christos 	stur	xzr,[x22,#-8]		// wipe tp
    218  1.1  christos 	stur	x14,[x0,#-8]
    219  1.1  christos 
                       // Epilogue: restore x19-x24 from the frame at x29, release the tp
                       // scratch by resetting sp to x29, then pop the 64-byte frame while
                       // reloading x29. x30 is not reloaded: no instruction on this path
                       // writes it. The return value is the constant 1.
    220  1.1  christos 	ldp	x19,x20,[x29,#16]
    221  1.1  christos 	mov	sp,x29
    222  1.1  christos 	ldp	x21,x22,[x29,#32]
    223  1.1  christos 	mov	x0,#1
    224  1.1  christos 	ldp	x23,x24,[x29,#48]
    225  1.1  christos 	ldr	x29,[sp],#64
                       // NOTE(review): presumably the counterpart of
                       // AARCH64_SIGN_LINK_REGISTER at entry -- confirm in arm_arch.h.
    226  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
    227  1.1  christos 	ret
    228  1.1  christos .size	bn_mul_mont,.-bn_mul_mont
    229  1.1  christos .type	bn_mul8x_mont_neon,%function
    230  1.1  christos .align	5
    231  1.1  christos bn_mul8x_mont_neon:
    232  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
    233  1.2  christos 	// only from bn_mul_mont which has already signed the return address.
    234  1.1  christos 	stp	x29,x30,[sp,#-80]!
    235  1.1  christos 	mov	x16,sp
    236  1.1  christos 	stp	d8,d9,[sp,#16]
    237  1.1  christos 	stp	d10,d11,[sp,#32]
    238  1.1  christos 	stp	d12,d13,[sp,#48]
    239  1.1  christos 	stp	d14,d15,[sp,#64]
    240  1.1  christos 	lsl	x5,x5,#1
    241  1.1  christos 	eor	v14.16b,v14.16b,v14.16b
    242  1.1  christos 
    243  1.1  christos .align	4
    244  1.1  christos .LNEON_8n:
    245  1.1  christos 	eor	v6.16b,v6.16b,v6.16b
    246  1.1  christos 	sub	x7,sp,#128
    247  1.1  christos 	eor	v7.16b,v7.16b,v7.16b
    248  1.1  christos 	sub	x7,x7,x5,lsl#4
    249  1.1  christos 	eor	v8.16b,v8.16b,v8.16b
    250  1.1  christos 	and	x7,x7,#-64
    251  1.1  christos 	eor	v9.16b,v9.16b,v9.16b
    252  1.1  christos 	mov	sp,x7		// alloca
    253  1.1  christos 	eor	v10.16b,v10.16b,v10.16b
    254  1.1  christos 	add	x7,x7,#256
    255  1.1  christos 	eor	v11.16b,v11.16b,v11.16b
    256  1.1  christos 	sub	x8,x5,#8
    257  1.1  christos 	eor	v12.16b,v12.16b,v12.16b
    258  1.1  christos 	eor	v13.16b,v13.16b,v13.16b
    259  1.1  christos 
    260  1.1  christos .LNEON_8n_init:
    261  1.1  christos 	st1	{v6.2d,v7.2d},[x7],#32
    262  1.1  christos 	subs	x8,x8,#8
    263  1.1  christos 	st1	{v8.2d,v9.2d},[x7],#32
    264  1.1  christos 	st1	{v10.2d,v11.2d},[x7],#32
    265  1.1  christos 	st1	{v12.2d,v13.2d},[x7],#32
    266  1.1  christos 	bne	.LNEON_8n_init
    267  1.1  christos 
    268  1.1  christos 	add	x6,sp,#256
    269  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    270  1.1  christos 	add	x10,sp,#8
    271  1.1  christos 	ldr	s30,[x4],#4
    272  1.1  christos 	mov	x9,x5
    273  1.1  christos 	b	.LNEON_8n_outer
    274  1.1  christos 
    275  1.1  christos .align	4
    276  1.1  christos .LNEON_8n_outer:
    277  1.1  christos 	ldr	s28,[x2],#4   // *b++
    278  1.1  christos 	uxtl	v28.4s,v28.4h
    279  1.1  christos 	add	x7,sp,#128
    280  1.1  christos 	ld1	{v2.4s,v3.4s},[x3],#32
    281  1.1  christos 
    282  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[0]
    283  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[1]
    284  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[2]
    285  1.1  christos 	shl	v29.2d,v6.2d,#16
    286  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    287  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[3]
    288  1.1  christos 	add	v29.2d,v29.2d,v6.2d
    289  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[0]
    290  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    291  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[1]
    292  1.1  christos 	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
    293  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[2]
    294  1.1  christos 	uxtl	v29.4s,v29.4h
    295  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[3]
    296  1.1  christos 	ldr	s28,[x2],#4   // *b++
    297  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[0]
    298  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[1]
    299  1.1  christos 	uxtl	v28.4s,v28.4h
    300  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[2]
    301  1.1  christos 	ushr	v15.2d,v6.2d,#16
    302  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[3]
    303  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[0]
    304  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8
    305  1.1  christos 	add	v6.2d,v6.2d,v15.2d
    306  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[1]
    307  1.1  christos 	ushr	v6.2d,v6.2d,#16
    308  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[2]
    309  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[3]
    310  1.1  christos 	add	v16.2d,v7.2d,v6.2d
    311  1.1  christos 	ins	v7.d[0],v16.d[0]
    312  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
    313  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[0]
    314  1.1  christos 	ld1	{v6.2d},[x6],#16
    315  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[1]
    316  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[2]
    317  1.1  christos 	shl	v29.2d,v7.2d,#16
    318  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    319  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[3]
    320  1.1  christos 	add	v29.2d,v29.2d,v7.2d
    321  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[0]
    322  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    323  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[1]
    324  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
    325  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[2]
    326  1.1  christos 	uxtl	v29.4s,v29.4h
    327  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[3]
    328  1.1  christos 	ldr	s28,[x2],#4   // *b++
    329  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[0]
    330  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[1]
    331  1.1  christos 	uxtl	v28.4s,v28.4h
    332  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[2]
    333  1.1  christos 	ushr	v15.2d,v7.2d,#16
    334  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[3]
    335  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[0]
    336  1.1  christos 	ext	v7.16b,v7.16b,v7.16b,#8
    337  1.1  christos 	add	v7.2d,v7.2d,v15.2d
    338  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[1]
    339  1.1  christos 	ushr	v7.2d,v7.2d,#16
    340  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[2]
    341  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[3]
    342  1.1  christos 	add	v16.2d,v8.2d,v7.2d
    343  1.1  christos 	ins	v8.d[0],v16.d[0]
    344  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
    345  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[0]
    346  1.1  christos 	ld1	{v7.2d},[x6],#16
    347  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[1]
    348  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[2]
    349  1.1  christos 	shl	v29.2d,v8.2d,#16
    350  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    351  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[3]
    352  1.1  christos 	add	v29.2d,v29.2d,v8.2d
    353  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[0]
    354  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    355  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[1]
    356  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
    357  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[2]
    358  1.1  christos 	uxtl	v29.4s,v29.4h
    359  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[3]
    360  1.1  christos 	ldr	s28,[x2],#4   // *b++
    361  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[0]
    362  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[1]
    363  1.1  christos 	uxtl	v28.4s,v28.4h
    364  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[2]
    365  1.1  christos 	ushr	v15.2d,v8.2d,#16
    366  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[3]
    367  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[0]
    368  1.1  christos 	ext	v8.16b,v8.16b,v8.16b,#8
    369  1.1  christos 	add	v8.2d,v8.2d,v15.2d
    370  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[1]
    371  1.1  christos 	ushr	v8.2d,v8.2d,#16
    372  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[2]
    373  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[3]
    374  1.1  christos 	add	v16.2d,v9.2d,v8.2d
    375  1.1  christos 	ins	v9.d[0],v16.d[0]
    376  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
    377  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[0]
    378  1.1  christos 	ld1	{v8.2d},[x6],#16
    379  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[1]
    380  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[2]
    381  1.1  christos 	shl	v29.2d,v9.2d,#16
    382  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    383  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[3]
    384  1.1  christos 	add	v29.2d,v29.2d,v9.2d
    385  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[0]
    386  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    387  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[1]
    388  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
    389  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[2]
    390  1.1  christos 	uxtl	v29.4s,v29.4h
    391  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[3]
    392  1.1  christos 	ldr	s28,[x2],#4   // *b++
    393  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[0]
    394  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[1]
    395  1.1  christos 	uxtl	v28.4s,v28.4h
    396  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[2]
    397  1.1  christos 	ushr	v15.2d,v9.2d,#16
    398  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[3]
    399  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[0]
    400  1.1  christos 	ext	v9.16b,v9.16b,v9.16b,#8
    401  1.1  christos 	add	v9.2d,v9.2d,v15.2d
    402  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[1]
    403  1.1  christos 	ushr	v9.2d,v9.2d,#16
    404  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[2]
    405  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[3]
    406  1.1  christos 	add	v16.2d,v10.2d,v9.2d
    407  1.1  christos 	ins	v10.d[0],v16.d[0]
    408  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
    409  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[0]
    410  1.1  christos 	ld1	{v9.2d},[x6],#16
    411  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[1]
    412  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[2]
    413  1.1  christos 	shl	v29.2d,v10.2d,#16
    414  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    415  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[3]
    416  1.1  christos 	add	v29.2d,v29.2d,v10.2d
    417  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[0]
    418  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    419  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[1]
    420  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
    421  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[2]
    422  1.1  christos 	uxtl	v29.4s,v29.4h
    423  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[3]
    424  1.1  christos 	ldr	s28,[x2],#4   // *b++
    425  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[0]
    426  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[1]
    427  1.1  christos 	uxtl	v28.4s,v28.4h
    428  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[2]
    429  1.1  christos 	ushr	v15.2d,v10.2d,#16
    430  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[3]
    431  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[0]
    432  1.1  christos 	ext	v10.16b,v10.16b,v10.16b,#8
    433  1.1  christos 	add	v10.2d,v10.2d,v15.2d
    434  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[1]
    435  1.1  christos 	ushr	v10.2d,v10.2d,#16
    436  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[2]
    437  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[3]
    438  1.1  christos 	add	v16.2d,v11.2d,v10.2d
    439  1.1  christos 	ins	v11.d[0],v16.d[0]
    440  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
    441  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[0]
    442  1.1  christos 	ld1	{v10.2d},[x6],#16
    443  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[1]
    444  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[2]
    445  1.1  christos 	shl	v29.2d,v11.2d,#16
    446  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    447  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[3]
    448  1.1  christos 	add	v29.2d,v29.2d,v11.2d
    449  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[0]
    450  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    451  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[1]
    452  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
    453  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[2]
    454  1.1  christos 	uxtl	v29.4s,v29.4h
    455  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[3]
    456  1.1  christos 	ldr	s28,[x2],#4   // *b++
    457  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[0]
    458  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[1]
    459  1.1  christos 	uxtl	v28.4s,v28.4h
    460  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[2]
    461  1.1  christos 	ushr	v15.2d,v11.2d,#16
    462  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[3]
    463  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[0]
    464  1.1  christos 	ext	v11.16b,v11.16b,v11.16b,#8
    465  1.1  christos 	add	v11.2d,v11.2d,v15.2d
    466  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[1]
    467  1.1  christos 	ushr	v11.2d,v11.2d,#16
    468  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[2]
    469  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[3]
    470  1.1  christos 	add	v16.2d,v12.2d,v11.2d
    471  1.1  christos 	ins	v12.d[0],v16.d[0]
    472  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
    473  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[0]
    474  1.1  christos 	ld1	{v11.2d},[x6],#16
    475  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[1]
    476  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[2]
    477  1.1  christos 	shl	v29.2d,v12.2d,#16
    478  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    479  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[3]
    480  1.1  christos 	add	v29.2d,v29.2d,v12.2d
    481  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[0]
    482  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    483  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[1]
    484  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
    485  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[2]
    486  1.1  christos 	uxtl	v29.4s,v29.4h
    487  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[3]
    488  1.1  christos 	ldr	s28,[x2],#4   // *b++
    489  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[0]
    490  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[1]
    491  1.1  christos 	uxtl	v28.4s,v28.4h
    492  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[2]
    493  1.1  christos 	ushr	v15.2d,v12.2d,#16
    494  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[3]
    495  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[0]
    496  1.1  christos 	ext	v12.16b,v12.16b,v12.16b,#8
    497  1.1  christos 	add	v12.2d,v12.2d,v15.2d
    498  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[1]
    499  1.1  christos 	ushr	v12.2d,v12.2d,#16
    500  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[2]
    501  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[3]
    502  1.1  christos 	add	v16.2d,v13.2d,v12.2d
    503  1.1  christos 	ins	v13.d[0],v16.d[0]
    504  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
    505  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[0]
    506  1.1  christos 	ld1	{v12.2d},[x6],#16
    507  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[1]
    508  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[2]
    509  1.1  christos 	shl	v29.2d,v13.2d,#16
    510  1.1  christos 	ext	v29.16b,v29.16b,v29.16b,#8
    511  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[3]
    512  1.1  christos 	add	v29.2d,v29.2d,v13.2d
    513  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[0]
    514  1.1  christos 	mul	v29.2s,v29.2s,v30.2s
    515  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[1]
    516  1.1  christos 	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
    517  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[2]
    518  1.1  christos 	uxtl	v29.4s,v29.4h
    519  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[3]
    520  1.1  christos 	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
    521  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[0]
    522  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    523  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[1]
    524  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[2]
    525  1.1  christos 	mov	v5.16b,v13.16b
    526  1.1  christos 	ushr	v5.2d,v5.2d,#16
    527  1.1  christos 	ext	v13.16b,v13.16b,v13.16b,#8
    528  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[3]
    529  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[0]
    530  1.1  christos 	add	v13.2d,v13.2d,v5.2d
    531  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[1]
    532  1.1  christos 	ushr	v13.2d,v13.2d,#16
    533  1.1  christos 	eor	v15.16b,v15.16b,v15.16b
    534  1.1  christos 	ins	v13.d[1],v15.d[0]
    535  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[2]
    536  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[3]
    537  1.1  christos 	add	v6.2d,v6.2d,v13.2d
    538  1.1  christos 	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
    539  1.1  christos 	add	x10,sp,#8		// rewind
    540  1.1  christos 	sub	x8,x5,#8
    541  1.1  christos 	b	.LNEON_8n_inner
    542  1.1  christos 
    543  1.1  christos .align	4
    544  1.1  christos .LNEON_8n_inner:
    545  1.1  christos 	subs	x8,x8,#8
    546  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[0]
    547  1.1  christos 	ld1	{v13.2d},[x6]
    548  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[1]
    549  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
    550  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[2]
    551  1.1  christos 	ld1	{v2.4s,v3.4s},[x3],#32
    552  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[3]
    553  1.1  christos 	b.eq	.LInner_jump
    554  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    555  1.1  christos .LInner_jump:
    556  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[0]
    557  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[1]
    558  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[2]
    559  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[3]
    560  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
    561  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[0]
    562  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[1]
    563  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[2]
    564  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[3]
    565  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[0]
    566  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[1]
    567  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[2]
    568  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[3]
    569  1.1  christos 	st1	{v6.2d},[x7],#16
    570  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[0]
    571  1.1  christos 	ld1	{v6.2d},[x6]
    572  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[1]
    573  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
    574  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[2]
    575  1.1  christos 	b.eq	.LInner_jump1
    576  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    577  1.1  christos .LInner_jump1:
    578  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[3]
    579  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[0]
    580  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[1]
    581  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[2]
    582  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[3]
    583  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
    584  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[0]
    585  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[1]
    586  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[2]
    587  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[3]
    588  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[0]
    589  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[1]
    590  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[2]
    591  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[3]
    592  1.1  christos 	st1	{v7.2d},[x7],#16
    593  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[0]
    594  1.1  christos 	ld1	{v7.2d},[x6]
    595  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[1]
    596  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
    597  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[2]
    598  1.1  christos 	b.eq	.LInner_jump2
    599  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    600  1.1  christos .LInner_jump2:
    601  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[3]
    602  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[0]
    603  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[1]
    604  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[2]
    605  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[3]
    606  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
    607  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[0]
    608  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[1]
    609  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[2]
    610  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[3]
    611  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[0]
    612  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[1]
    613  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[2]
    614  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[3]
    615  1.1  christos 	st1	{v8.2d},[x7],#16
    616  1.1  christos 	umlal	v9.2d,v28.2s,v0.s[0]
    617  1.1  christos 	ld1	{v8.2d},[x6]
    618  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[1]
    619  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
    620  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[2]
    621  1.1  christos 	b.eq	.LInner_jump3
    622  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    623  1.1  christos .LInner_jump3:
    624  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[3]
    625  1.1  christos 	umlal	v13.2d,v28.2s,v1.s[0]
    626  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[1]
    627  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[2]
    628  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[3]
    629  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
    630  1.1  christos 	umlal	v9.2d,v29.2s,v2.s[0]
    631  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[1]
    632  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[2]
    633  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[3]
    634  1.1  christos 	umlal	v13.2d,v29.2s,v3.s[0]
    635  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[1]
    636  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[2]
    637  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[3]
    638  1.1  christos 	st1	{v9.2d},[x7],#16
    639  1.1  christos 	umlal	v10.2d,v28.2s,v0.s[0]
    640  1.1  christos 	ld1	{v9.2d},[x6]
    641  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[1]
    642  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
    643  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[2]
    644  1.1  christos 	b.eq	.LInner_jump4
    645  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    646  1.1  christos .LInner_jump4:
    647  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[3]
    648  1.1  christos 	umlal	v6.2d,v28.2s,v1.s[0]
    649  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[1]
    650  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[2]
    651  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[3]
    652  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
    653  1.1  christos 	umlal	v10.2d,v29.2s,v2.s[0]
    654  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[1]
    655  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[2]
    656  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[3]
    657  1.1  christos 	umlal	v6.2d,v29.2s,v3.s[0]
    658  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[1]
    659  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[2]
    660  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[3]
    661  1.1  christos 	st1	{v10.2d},[x7],#16
    662  1.1  christos 	umlal	v11.2d,v28.2s,v0.s[0]
    663  1.1  christos 	ld1	{v10.2d},[x6]
    664  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[1]
    665  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
    666  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[2]
    667  1.1  christos 	b.eq	.LInner_jump5
    668  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    669  1.1  christos .LInner_jump5:
    670  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[3]
    671  1.1  christos 	umlal	v7.2d,v28.2s,v1.s[0]
    672  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[1]
    673  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[2]
    674  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[3]
    675  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
    676  1.1  christos 	umlal	v11.2d,v29.2s,v2.s[0]
    677  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[1]
    678  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[2]
    679  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[3]
    680  1.1  christos 	umlal	v7.2d,v29.2s,v3.s[0]
    681  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[1]
    682  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[2]
    683  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[3]
    684  1.1  christos 	st1	{v11.2d},[x7],#16
    685  1.1  christos 	umlal	v12.2d,v28.2s,v0.s[0]
    686  1.1  christos 	ld1	{v11.2d},[x6]
    687  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[1]
    688  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
    689  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[2]
    690  1.1  christos 	b.eq	.LInner_jump6
    691  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    692  1.1  christos .LInner_jump6:
    693  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[3]
    694  1.1  christos 	umlal	v8.2d,v28.2s,v1.s[0]
    695  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[1]
    696  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[2]
    697  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[3]
    698  1.1  christos 	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
    699  1.1  christos 	umlal	v12.2d,v29.2s,v2.s[0]
    700  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[1]
    701  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[2]
    702  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[3]
    703  1.1  christos 	umlal	v8.2d,v29.2s,v3.s[0]
    704  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[1]
    705  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[2]
    706  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[3]
    707  1.1  christos 	st1	{v12.2d},[x7],#16
    708  1.1  christos 	umlal	v13.2d,v28.2s,v0.s[0]
    709  1.1  christos 	ld1	{v12.2d},[x6]
    710  1.1  christos 	umlal	v6.2d,v28.2s,v0.s[1]
    711  1.1  christos 	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
    712  1.1  christos 	umlal	v7.2d,v28.2s,v0.s[2]
    713  1.1  christos 	b.eq	.LInner_jump7
    714  1.1  christos 	add	x6,x6,#16	// don't advance in last iteration
    715  1.1  christos .LInner_jump7:
    716  1.1  christos 	umlal	v8.2d,v28.2s,v0.s[3]
    717  1.1  christos 	umlal	v9.2d,v28.2s,v1.s[0]
    718  1.1  christos 	umlal	v10.2d,v28.2s,v1.s[1]
    719  1.1  christos 	umlal	v11.2d,v28.2s,v1.s[2]
    720  1.1  christos 	umlal	v12.2d,v28.2s,v1.s[3]
    721  1.1  christos 	b.ne	.LInner_after_rewind8
    722  1.1  christos 	sub	x1,x1,x5,lsl#2	// rewind
    723  1.1  christos .LInner_after_rewind8:
    724  1.1  christos 	umlal	v13.2d,v29.2s,v2.s[0]
    725  1.1  christos 	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
    726  1.1  christos 	umlal	v6.2d,v29.2s,v2.s[1]
    727  1.1  christos 	ld1	{v0.4s,v1.4s},[x1],#32
    728  1.1  christos 	umlal	v7.2d,v29.2s,v2.s[2]
    729  1.1  christos 	add	x10,sp,#8		// rewind
    730  1.1  christos 	umlal	v8.2d,v29.2s,v2.s[3]
    731  1.1  christos 	umlal	v9.2d,v29.2s,v3.s[0]
    732  1.1  christos 	umlal	v10.2d,v29.2s,v3.s[1]
    733  1.1  christos 	umlal	v11.2d,v29.2s,v3.s[2]
    734  1.1  christos 	st1	{v13.2d},[x7],#16
    735  1.1  christos 	umlal	v12.2d,v29.2s,v3.s[3]
    736  1.1  christos 
    737  1.1  christos 	bne	.LNEON_8n_inner
    738  1.1  christos 	add	x6,sp,#128
    739  1.1  christos 	st1	{v6.2d,v7.2d},[x7],#32
    740  1.1  christos 	eor	v2.16b,v2.16b,v2.16b	// v2
    741  1.1  christos 	st1	{v8.2d,v9.2d},[x7],#32
    742  1.1  christos 	eor	v3.16b,v3.16b,v3.16b	// v3
    743  1.1  christos 	st1	{v10.2d,v11.2d},[x7],#32
    744  1.1  christos 	st1	{v12.2d},[x7]
    745  1.1  christos 
    746  1.1  christos 	subs	x9,x9,#8
    747  1.1  christos 	ld1	{v6.2d,v7.2d},[x6],#32
    748  1.1  christos 	ld1	{v8.2d,v9.2d},[x6],#32
    749  1.1  christos 	ld1	{v10.2d,v11.2d},[x6],#32
    750  1.1  christos 	ld1	{v12.2d,v13.2d},[x6],#32
    751  1.1  christos 
    752  1.1  christos 	b.eq	.LInner_8n_jump_2steps
    753  1.1  christos 	sub	x3,x3,x5,lsl#2	// rewind
    754  1.1  christos 	b	.LNEON_8n_outer
    755  1.1  christos 
    756  1.1  christos .LInner_8n_jump_2steps:
    757  1.1  christos 	add	x7,sp,#128
    758  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
    759  1.1  christos 	mov	v5.16b,v6.16b
    760  1.1  christos 	ushr	v15.2d,v6.2d,#16
    761  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8
    762  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32
    763  1.1  christos 	add	v6.2d,v6.2d,v15.2d
    764  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32
    765  1.1  christos 	ushr	v15.2d,v6.2d,#16
    766  1.1  christos 	st1	{v2.2d,v3.2d}, [sp],#32
    767  1.1  christos 	zip1	v6.4h,v5.4h,v6.4h
    768  1.1  christos 	ins	v15.d[1],v14.d[0]
    769  1.1  christos 
    770  1.1  christos 	mov	x8,x5
    771  1.1  christos 	b	.LNEON_tail_entry
    772  1.1  christos 
    773  1.1  christos .align	4
    774  1.1  christos .LNEON_tail:
    775  1.1  christos 	add	v6.2d,v6.2d,v15.2d
    776  1.1  christos 	mov	v5.16b,v6.16b
    777  1.1  christos 	ushr	v15.2d,v6.2d,#16
    778  1.1  christos 	ext	v6.16b,v6.16b,v6.16b,#8
    779  1.1  christos 	ld1	{v8.2d,v9.2d}, [x6],#32
    780  1.1  christos 	add	v6.2d,v6.2d,v15.2d
    781  1.1  christos 	ld1	{v10.2d,v11.2d}, [x6],#32
    782  1.1  christos 	ushr	v15.2d,v6.2d,#16
    783  1.1  christos 	ld1	{v12.2d,v13.2d}, [x6],#32
    784  1.1  christos 	zip1	v6.4h,v5.4h,v6.4h
    785  1.1  christos 	ins	v15.d[1],v14.d[0]
    786  1.1  christos 
    787  1.1  christos .LNEON_tail_entry:
    788  1.1  christos 	add	v7.2d,v7.2d,v15.2d
    789  1.1  christos 	st1	{v6.s}[0], [x7],#4
    790  1.1  christos 	ushr	v15.2d,v7.2d,#16
    791  1.1  christos 	mov	v5.16b,v7.16b
    792  1.1  christos 	ext	v7.16b,v7.16b,v7.16b,#8
    793  1.1  christos 	add	v7.2d,v7.2d,v15.2d
    794  1.1  christos 	ushr	v15.2d,v7.2d,#16
    795  1.1  christos 	zip1	v7.4h,v5.4h,v7.4h
    796  1.1  christos 	ins	v15.d[1],v14.d[0]
    797  1.1  christos 	add	v8.2d,v8.2d,v15.2d
    798  1.1  christos 	st1	{v7.s}[0], [x7],#4
    799  1.1  christos 	ushr	v15.2d,v8.2d,#16
    800  1.1  christos 	mov	v5.16b,v8.16b
    801  1.1  christos 	ext	v8.16b,v8.16b,v8.16b,#8
    802  1.1  christos 	add	v8.2d,v8.2d,v15.2d
    803  1.1  christos 	ushr	v15.2d,v8.2d,#16
    804  1.1  christos 	zip1	v8.4h,v5.4h,v8.4h
    805  1.1  christos 	ins	v15.d[1],v14.d[0]
    806  1.1  christos 	add	v9.2d,v9.2d,v15.2d
    807  1.1  christos 	st1	{v8.s}[0], [x7],#4
    808  1.1  christos 	ushr	v15.2d,v9.2d,#16
    809  1.1  christos 	mov	v5.16b,v9.16b
    810  1.1  christos 	ext	v9.16b,v9.16b,v9.16b,#8
    811  1.1  christos 	add	v9.2d,v9.2d,v15.2d
    812  1.1  christos 	ushr	v15.2d,v9.2d,#16
    813  1.1  christos 	zip1	v9.4h,v5.4h,v9.4h
    814  1.1  christos 	ins	v15.d[1],v14.d[0]
    815  1.1  christos 	add	v10.2d,v10.2d,v15.2d
    816  1.1  christos 	st1	{v9.s}[0], [x7],#4
    817  1.1  christos 	ushr	v15.2d,v10.2d,#16
    818  1.1  christos 	mov	v5.16b,v10.16b
    819  1.1  christos 	ext	v10.16b,v10.16b,v10.16b,#8
    820  1.1  christos 	add	v10.2d,v10.2d,v15.2d
    821  1.1  christos 	ushr	v15.2d,v10.2d,#16
    822  1.1  christos 	zip1	v10.4h,v5.4h,v10.4h
    823  1.1  christos 	ins	v15.d[1],v14.d[0]
    824  1.1  christos 	add	v11.2d,v11.2d,v15.2d
    825  1.1  christos 	st1	{v10.s}[0], [x7],#4
    826  1.1  christos 	ushr	v15.2d,v11.2d,#16
    827  1.1  christos 	mov	v5.16b,v11.16b
    828  1.1  christos 	ext	v11.16b,v11.16b,v11.16b,#8
    829  1.1  christos 	add	v11.2d,v11.2d,v15.2d
    830  1.1  christos 	ushr	v15.2d,v11.2d,#16
    831  1.1  christos 	zip1	v11.4h,v5.4h,v11.4h
    832  1.1  christos 	ins	v15.d[1],v14.d[0]
    833  1.1  christos 	add	v12.2d,v12.2d,v15.2d
    834  1.1  christos 	st1	{v11.s}[0], [x7],#4
    835  1.1  christos 	ushr	v15.2d,v12.2d,#16
    836  1.1  christos 	mov	v5.16b,v12.16b
    837  1.1  christos 	ext	v12.16b,v12.16b,v12.16b,#8
    838  1.1  christos 	add	v12.2d,v12.2d,v15.2d
    839  1.1  christos 	ushr	v15.2d,v12.2d,#16
    840  1.1  christos 	zip1	v12.4h,v5.4h,v12.4h
    841  1.1  christos 	ins	v15.d[1],v14.d[0]
    842  1.1  christos 	add	v13.2d,v13.2d,v15.2d
    843  1.1  christos 	st1	{v12.s}[0], [x7],#4
    844  1.1  christos 	ushr	v15.2d,v13.2d,#16
    845  1.1  christos 	mov	v5.16b,v13.16b
    846  1.1  christos 	ext	v13.16b,v13.16b,v13.16b,#8
    847  1.1  christos 	add	v13.2d,v13.2d,v15.2d
    848  1.1  christos 	ushr	v15.2d,v13.2d,#16
    849  1.1  christos 	zip1	v13.4h,v5.4h,v13.4h
    850  1.1  christos 	ins	v15.d[1],v14.d[0]
    851  1.1  christos 	ld1	{v6.2d,v7.2d}, [x6],#32
    852  1.1  christos 	subs	x8,x8,#8
    853  1.1  christos 	st1	{v13.s}[0], [x7],#4
    854  1.1  christos 	bne	.LNEON_tail
    855  1.1  christos 
    856  1.1  christos 	st1	{v15.s}[0], [x7],#4	// top-most bit
    857  1.1  christos 	sub	x3,x3,x5,lsl#2		// rewind x3
    858  1.1  christos 	subs	x1,sp,#0			// x1 = sp; x-0 never borrows, so this sets C=1 ("no borrow") for the sbcs chain
    859  1.1  christos 	add	x2,sp,x5,lsl#2	// x2 = sp + (x5<<2): loop bound for x1
    860  1.1  christos 
    861  1.1  christos .LNEON_sub:	// [x0] = [x1] - [x3] with borrow, four 32-bit words per pass
    862  1.1  christos 	ldp	w4,w5,[x1],#8	// minuend words, read from the stack (x1 started at sp)
    863  1.1  christos 	ldp	w6,w7,[x1],#8
    864  1.1  christos 	ldp	w8,w9,[x3],#8	// subtrahend words from [x3]
    865  1.1  christos 	ldp	w10,w11,[x3],#8
    866  1.1  christos 	sbcs	w8,w4,w8	// borrow propagates through C across words and across passes
    867  1.1  christos 	sbcs	w9,w5,w9
    868  1.1  christos 	sbcs	w10,w6,w10
    869  1.1  christos 	sbcs	w11,w7,w11
    870  1.1  christos 	sub	x17,x2,x1	// plain sub (not subs): leaves the borrow in C untouched
    871  1.1  christos 	stp	w8,w9,[x0],#8	// differences go to [x0]
    872  1.1  christos 	stp	w10,w11,[x0],#8
    873  1.1  christos 	cbnz	x17,.LNEON_sub	// repeat until x1 reaches x2
    874  1.1  christos 
    875  1.1  christos 	ldr	w10, [x1]		// load top-most bit
    876  1.1  christos 	mov	x11,sp
    877  1.1  christos 	eor	v0.16b,v0.16b,v0.16b	// v0 = 0, source for the wipes below
    878  1.1  christos 	sub	x11,x2,x11		// this is num*4
    879  1.1  christos 	eor	v1.16b,v1.16b,v1.16b	// v1 = 0
    880  1.1  christos 	mov	x1,sp	// restart reading the stack words at sp
    881  1.1  christos 	sub	x0,x0,x11		// rewind x0
    882  1.1  christos 	mov	x3,x2		// second 3/4th of frame
    883  1.1  christos 	sbcs	w10,w10,wzr		// result is carry flag: C=0 iff the subtraction, top word included, borrowed
    884  1.1  christos 
    885  1.1  christos .LNEON_copy_n_zap:	// per pass: settle 8 words at [x0], wipe the 32 stack bytes read and 96 bytes at x3
    886  1.1  christos 	ldp	w4,w5,[x1],#8	// stack words (the minuend of .LNEON_sub)
    887  1.1  christos 	ldp	w6,w7,[x1],#8
    888  1.1  christos 	ldp	w8,w9,[x0],#8	// differences stored at [x0] by .LNEON_sub
    889  1.1  christos 	ldp	w10,w11,[x0]
    890  1.1  christos 	sub	x0,x0,#8	// x0 back to the first of these four words
    891  1.1  christos 	b.cs	.LCopy_1	// C=1 (no borrow): keep the differences
    892  1.1  christos 	mov	w8,w4	// C=0 (borrow): take the stack words instead
    893  1.1  christos 	mov	w9,w5
    894  1.1  christos 	mov	w10,w6
    895  1.1  christos 	mov	w11,w7
    896  1.1  christos .LCopy_1:
    897  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
    898  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
    899  1.1  christos 	ldp	w4,w5,[x1],#8	// next four stack words
    900  1.1  christos 	ldp	w6,w7,[x1],#8
    901  1.1  christos 	stp	w8,w9,[x0],#8	// write back the selected first four words
    902  1.1  christos 	stp	w10,w11,[x0],#8
    903  1.1  christos 	sub	x1,x1,#32	// x1 back to the start of the 32 bytes read this pass
    904  1.1  christos 	ldp	w8,w9,[x0],#8
    905  1.1  christos 	ldp	w10,w11,[x0]
    906  1.1  christos 	sub	x0,x0,#8
    907  1.1  christos 	b.cs	.LCopy_2	// same selection for the second four words
    908  1.1  christos 	mov	w8, w4
    909  1.1  christos 	mov	w9, w5
    910  1.1  christos 	mov	w10, w6
    911  1.1  christos 	mov	w11, w7
    912  1.1  christos .LCopy_2:
    913  1.1  christos 	st1	{v0.2d,v1.2d}, [x1],#32		// wipe the 32 stack bytes just read
    914  1.1  christos 	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
    915  1.1  christos 	sub	x17,x2,x1		// preserves carry
    916  1.1  christos 	stp	w8,w9,[x0],#8
    917  1.1  christos 	stp	w10,w11,[x0],#8
    918  1.1  christos 	cbnz	x17,.LNEON_copy_n_zap	// repeat until x1 reaches x2
    919  1.1  christos 
    920  1.1  christos 	mov	sp,x16	// release the scratch frame; NOTE(review): x16 presumably holds the sp saved by the prologue (not visible here) - confirm
    921  1.1  christos 	ldp	d14,d15,[sp,#64]	// restore d8-d15 from the save area
    922  1.1  christos 	ldp	d12,d13,[sp,#48]
    923  1.1  christos 	ldp	d10,d11,[sp,#32]
    924  1.1  christos 	ldp	d8,d9,[sp,#16]
    925  1.1  christos 	ldr	x29,[sp],#80	// restore x29 and pop the 80-byte save area
    926  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
    927  1.1  christos 	ret	// RET
    928  1.1  christos 
    929  1.1  christos .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
    930  1.1  christos .type	__bn_sqr8x_mont,%function
    931  1.1  christos .align	5
    932  1.1  christos __bn_sqr8x_mont:	// entered from bn_mul_mont when x5 (num) is a multiple of 8
    933  1.1  christos 	cmp	x1,x2	// ap == bp ?
    934  1.1  christos 	b.ne	__bn_mul4x_mont	// different operands: not a squaring, use the 4x multiply
    935  1.1  christos .Lsqr8x_mont:
    936  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
    937  1.2  christos 	// only from bn_mul_mont which has already signed the return address.
    938  1.1  christos 	stp	x29,x30,[sp,#-128]!	// 128-byte save area
    939  1.1  christos 	add	x29,sp,#0	// x29 = base of the save area
    940  1.1  christos 	stp	x19,x20,[sp,#16]
    941  1.1  christos 	stp	x21,x22,[sp,#32]
    942  1.1  christos 	stp	x23,x24,[sp,#48]
    943  1.1  christos 	stp	x25,x26,[sp,#64]
    944  1.1  christos 	stp	x27,x28,[sp,#80]
    945  1.1  christos 	stp	x0,x3,[sp,#96]	// offload rp and np
    946  1.1  christos 
    947  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// x6-x13 = a[0..7]
    948  1.1  christos 	ldp	x8,x9,[x1,#8*2]
    949  1.1  christos 	ldp	x10,x11,[x1,#8*4]
    950  1.1  christos 	ldp	x12,x13,[x1,#8*6]
    951  1.1  christos 
    952  1.1  christos 	sub	x2,sp,x5,lsl#4	// x2 = sp - 16*num bytes, i.e. room for 2*num 64-bit words
    953  1.1  christos 	lsl	x5,x5,#3	// x5 = num in bytes from here on
    954  1.1  christos 	ldr	x4,[x4]		// *n0
    955  1.1  christos 	mov	sp,x2			// alloca
    956  1.1  christos 	sub	x27,x5,#8*8	// zeroing counter: num bytes minus the 64 handled by the first pass
    957  1.1  christos 	b	.Lsqr8x_zero_start	// first pass skips the lowest 8 words at sp
    958  1.1  christos 
    959  1.1  christos .Lsqr8x_zero:	// each pass advances x2 by 16 words
    960  1.1  christos 	sub	x27,x27,#8*8
    961  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
    962  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
    963  1.1  christos 	stp	xzr,xzr,[x2,#8*4]
    964  1.1  christos 	stp	xzr,xzr,[x2,#8*6]
    965  1.1  christos .Lsqr8x_zero_start:
    966  1.1  christos 	stp	xzr,xzr,[x2,#8*8]
    967  1.1  christos 	stp	xzr,xzr,[x2,#8*10]
    968  1.1  christos 	stp	xzr,xzr,[x2,#8*12]
    969  1.1  christos 	stp	xzr,xzr,[x2,#8*14]
    970  1.1  christos 	add	x2,x2,#8*16
    971  1.1  christos 	cbnz	x27,.Lsqr8x_zero
    972  1.1  christos 
    973  1.1  christos 	add	x3,x1,x5	// x3 = ap + num bytes (end of a[])
    974  1.1  christos 	add	x1,x1,#8*8	// x1 = &a[8]; a[0..7] are already in x6-x13
    975  1.1  christos 	mov	x19,xzr	// clear x19-x26, the t[] accumulators of the outer loop
    976  1.1  christos 	mov	x20,xzr
    977  1.1  christos 	mov	x21,xzr
    978  1.1  christos 	mov	x22,xzr
    979  1.1  christos 	mov	x23,xzr
    980  1.1  christos 	mov	x24,xzr
    981  1.1  christos 	mov	x25,xzr
    982  1.1  christos 	mov	x26,xzr
    983  1.1  christos 	mov	x2,sp	// x2 = start of the scratch area
    984  1.1  christos 	str	x4,[x29,#112]		// offload n0
    985  1.1  christos 
    986  1.1  christos 	// Multiply everything but a[i]*a[i]
    987  1.1  christos .align	4
    988  1.1  christos .Lsqr8x_outer_loop:
    989  1.1  christos         //                                                 a[1]a[0]	(i)
    990  1.1  christos         //                                             a[2]a[0]
    991  1.1  christos         //                                         a[3]a[0]
    992  1.1  christos         //                                     a[4]a[0]
    993  1.1  christos         //                                 a[5]a[0]
    994  1.1  christos         //                             a[6]a[0]
    995  1.1  christos         //                         a[7]a[0]
    996  1.1  christos         //                                         a[2]a[1]		(ii)
    997  1.1  christos         //                                     a[3]a[1]
    998  1.1  christos         //                                 a[4]a[1]
    999  1.1  christos         //                             a[5]a[1]
   1000  1.1  christos         //                         a[6]a[1]
   1001  1.1  christos         //                     a[7]a[1]
   1002  1.1  christos         //                                 a[3]a[2]			(iii)
   1003  1.1  christos         //                             a[4]a[2]
   1004  1.1  christos         //                         a[5]a[2]
   1005  1.1  christos         //                     a[6]a[2]
   1006  1.1  christos         //                 a[7]a[2]
   1007  1.1  christos         //                         a[4]a[3]				(iv)
   1008  1.1  christos         //                     a[5]a[3]
   1009  1.1  christos         //                 a[6]a[3]
   1010  1.1  christos         //             a[7]a[3]
   1011  1.1  christos         //                 a[5]a[4]					(v)
   1012  1.1  christos         //             a[6]a[4]
   1013  1.1  christos         //         a[7]a[4]
   1014  1.1  christos         //         a[6]a[5]						(vi)
   1015  1.1  christos         //     a[7]a[5]
   1016  1.1  christos         // a[7]a[6]							(vii)
   1017  1.1  christos 
   1018  1.1  christos 	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
   1019  1.1  christos 	mul	x15,x8,x6
   1020  1.1  christos 	mul	x16,x9,x6
   1021  1.1  christos 	mul	x17,x10,x6
   1022  1.1  christos 	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
   1023  1.1  christos 	mul	x14,x11,x6
   1024  1.1  christos 	adcs	x21,x21,x15
   1025  1.1  christos 	mul	x15,x12,x6
   1026  1.1  christos 	adcs	x22,x22,x16
   1027  1.1  christos 	mul	x16,x13,x6
   1028  1.1  christos 	adcs	x23,x23,x17
   1029  1.1  christos 	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
   1030  1.1  christos 	adcs	x24,x24,x14
   1031  1.1  christos 	umulh	x14,x8,x6
   1032  1.1  christos 	adcs	x25,x25,x15
   1033  1.1  christos 	umulh	x15,x9,x6
   1034  1.1  christos 	adcs	x26,x26,x16
   1035  1.1  christos 	umulh	x16,x10,x6
   1036  1.1  christos 	stp	x19,x20,[x2],#8*2	// t[0..1]
   1037  1.1  christos 	adc	x19,xzr,xzr		// t[8]
   1038  1.1  christos 	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
   1039  1.1  christos 	umulh	x17,x11,x6
   1040  1.1  christos 	adcs	x22,x22,x14
   1041  1.1  christos 	umulh	x14,x12,x6
   1042  1.1  christos 	adcs	x23,x23,x15
   1043  1.1  christos 	umulh	x15,x13,x6
   1044  1.1  christos 	adcs	x24,x24,x16
   1045  1.1  christos 	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
   1046  1.1  christos 	adcs	x25,x25,x17
   1047  1.1  christos 	mul	x17,x9,x7
   1048  1.1  christos 	adcs	x26,x26,x14
   1049  1.1  christos 	mul	x14,x10,x7
   1050  1.1  christos 	adc	x19,x19,x15		// t[8] += hi(a[7]*a[0]) + carry
   1051  1.1  christos 
   1052  1.1  christos 	mul	x15,x11,x7
   1053  1.1  christos 	adds	x22,x22,x16
   1054  1.1  christos 	mul	x16,x12,x7
   1055  1.1  christos 	adcs	x23,x23,x17
   1056  1.1  christos 	mul	x17,x13,x7
   1057  1.1  christos 	adcs	x24,x24,x14
   1058  1.1  christos 	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
   1059  1.1  christos 	adcs	x25,x25,x15
   1060  1.1  christos 	umulh	x15,x9,x7
   1061  1.1  christos 	adcs	x26,x26,x16
   1062  1.1  christos 	umulh	x16,x10,x7
   1063  1.1  christos 	adcs	x19,x19,x17
   1064  1.1  christos 	umulh	x17,x11,x7
   1065  1.1  christos 	stp	x21,x22,[x2],#8*2	// t[2..3]
   1066  1.1  christos 	adc	x20,xzr,xzr		// t[9]
   1067  1.1  christos 	adds	x23,x23,x14
   1068  1.1  christos 	umulh	x14,x12,x7
   1069  1.1  christos 	adcs	x24,x24,x15
   1070  1.1  christos 	umulh	x15,x13,x7
   1071  1.1  christos 	adcs	x25,x25,x16
   1072  1.1  christos 	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
   1073  1.1  christos 	adcs	x26,x26,x17
   1074  1.1  christos 	mul	x17,x10,x8
   1075  1.1  christos 	adcs	x19,x19,x14
   1076  1.1  christos 	mul	x14,x11,x8
   1077  1.1  christos 	adc	x20,x20,x15
   1078  1.1  christos 
   1079  1.1  christos 	mul	x15,x12,x8
   1080  1.1  christos 	adds	x24,x24,x16
   1081  1.1  christos 	mul	x16,x13,x8
   1082  1.1  christos 	adcs	x25,x25,x17
   1083  1.1  christos 	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
   1084  1.1  christos 	adcs	x26,x26,x14
   1085  1.1  christos 	umulh	x14,x10,x8
   1086  1.1  christos 	adcs	x19,x19,x15
   1087  1.1  christos 	umulh	x15,x11,x8
   1088  1.1  christos 	adcs	x20,x20,x16
   1089  1.1  christos 	umulh	x16,x12,x8
   1090  1.1  christos 	stp	x23,x24,[x2],#8*2	// t[4..5]
   1091  1.1  christos 	adc	x21,xzr,xzr		// t[10]
   1092  1.1  christos 	adds	x25,x25,x17
   1093  1.1  christos 	umulh	x17,x13,x8
   1094  1.1  christos 	adcs	x26,x26,x14
   1095  1.1  christos 	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
   1096  1.1  christos 	adcs	x19,x19,x15
   1097  1.1  christos 	mul	x15,x11,x9
   1098  1.1  christos 	adcs	x20,x20,x16
   1099  1.1  christos 	mul	x16,x12,x9
   1100  1.1  christos 	adc	x21,x21,x17
   1101  1.1  christos 
   1102  1.1  christos 	mul	x17,x13,x9
   1103  1.1  christos 	adds	x26,x26,x14
   1104  1.1  christos 	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
   1105  1.1  christos 	adcs	x19,x19,x15
   1106  1.1  christos 	umulh	x15,x11,x9
   1107  1.1  christos 	adcs	x20,x20,x16
   1108  1.1  christos 	umulh	x16,x12,x9
   1109  1.1  christos 	adcs	x21,x21,x17
   1110  1.1  christos 	umulh	x17,x13,x9
   1111  1.1  christos 	stp	x25,x26,[x2],#8*2	// t[6..7]
   1112  1.1  christos 	adc	x22,xzr,xzr		// t[11]
   1113  1.1  christos 	adds	x19,x19,x14
   1114  1.1  christos 	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
   1115  1.1  christos 	adcs	x20,x20,x15
   1116  1.1  christos 	mul	x15,x12,x10
   1117  1.1  christos 	adcs	x21,x21,x16
   1118  1.1  christos 	mul	x16,x13,x10
   1119  1.1  christos 	adc	x22,x22,x17
   1120  1.1  christos 
   1121  1.1  christos 	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
   1122  1.1  christos 	adds	x20,x20,x14
   1123  1.1  christos 	umulh	x14,x12,x10
   1124  1.1  christos 	adcs	x21,x21,x15
   1125  1.1  christos 	umulh	x15,x13,x10
   1126  1.1  christos 	adcs	x22,x22,x16
   1127  1.1  christos 	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
   1128  1.1  christos 	adc	x23,xzr,xzr		// t[12]
   1129  1.1  christos 	adds	x21,x21,x17
   1130  1.1  christos 	mul	x17,x13,x11
   1131  1.1  christos 	adcs	x22,x22,x14
   1132  1.1  christos 	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
   1133  1.1  christos 	adc	x23,x23,x15
   1134  1.1  christos 
   1135  1.1  christos 	umulh	x15,x13,x11
   1136  1.1  christos 	adds	x22,x22,x16
   1137  1.1  christos 	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
   1138  1.1  christos 	adcs	x23,x23,x17
   1139  1.1  christos 	umulh	x17,x13,x12		// hi(a[7]*a[6])
   1140  1.1  christos 	adc	x24,xzr,xzr		// t[13]
   1141  1.1  christos 	adds	x23,x23,x14
   1142  1.1  christos 	sub	x27,x3,x1	// done yet?
   1143  1.1  christos 	adc	x24,x24,x15
   1144  1.1  christos 
   1145  1.1  christos 	adds	x24,x24,x16
   1146  1.1  christos 	sub	x14,x3,x5	// rewound ap: end of a[] minus num bytes = &a[0]
   1147  1.1  christos 	adc	x25,xzr,xzr		// t[14]
   1148  1.1  christos 	add	x25,x25,x17
   1149  1.1  christos 
   1150  1.1  christos 	cbz	x27,.Lsqr8x_outer_break	// x1 reached the end of a[]: cross products are complete
   1151  1.1  christos 
   1152  1.1  christos 	mov	x4,x6	// first multiplier for .Lsqr8x_mul: lowest word of the current 8-word window
   1153  1.1  christos 	ldp	x6,x7,[x2,#8*0]	// eight previously stored t[] words at x2
   1154  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1155  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1156  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1157  1.1  christos 	adds	x19,x19,x6	// fold them into the accumulators ...
   1158  1.1  christos 	adcs	x20,x20,x7
   1159  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// ... while x6-x13 are reloaded with the next 8 words of a[] at x1
   1160  1.1  christos 	adcs	x21,x21,x8
   1161  1.1  christos 	adcs	x22,x22,x9
   1162  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1163  1.1  christos 	adcs	x23,x23,x10
   1164  1.1  christos 	adcs	x24,x24,x11
   1165  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1166  1.1  christos 	adcs	x25,x25,x12
   1167  1.1  christos 	mov	x0,x1	// x0 = address of the 8 words being loaded; .Lsqr8x_mul fetches multipliers relative to it
   1168  1.1  christos 	adcs	x26,xzr,x13	// x26 starts from the stored word alone (plus carry)
   1169  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1170  1.1  christos 	add	x1,x1,#8*8
   1171  1.1  christos 	//adc	x28,xzr,xzr		// moved below
   1172  1.1  christos 	mov	x27,#-8*8	// byte index for .Lsqr8x_mul, counts up to 0 in steps of 8
   1173  1.1  christos 
   1174  1.1  christos 	//                                                         a[8]a[0]
   1175  1.1  christos 	//                                                     a[9]a[0]
   1176  1.1  christos 	//                                                 a[a]a[0]
   1177  1.1  christos 	//                                             a[b]a[0]
   1178  1.1  christos 	//                                         a[c]a[0]
   1179  1.1  christos 	//                                     a[d]a[0]
   1180  1.1  christos 	//                                 a[e]a[0]
   1181  1.1  christos 	//                             a[f]a[0]
   1182  1.1  christos 	//                                                     a[8]a[1]
   1183  1.1  christos 	//                         a[f]a[1]........................
   1184  1.1  christos 	//                                                 a[8]a[2]
   1185  1.1  christos 	//                     a[f]a[2]........................
   1186  1.1  christos 	//                                             a[8]a[3]
   1187  1.1  christos 	//                 a[f]a[3]........................
   1188  1.1  christos 	//                                         a[8]a[4]
   1189  1.1  christos 	//             a[f]a[4]........................
   1190  1.1  christos 	//                                     a[8]a[5]
   1191  1.1  christos 	//         a[f]a[5]........................
   1192  1.1  christos 	//                                 a[8]a[6]
   1193  1.1  christos 	//     a[f]a[6]........................
   1194  1.1  christos 	//                             a[8]a[7]
   1195  1.1  christos 	// a[f]a[7]........................
   1196  1.1  christos .Lsqr8x_mul:	// x19-x26 += x4 * (x6..x13); lowest word is stored at [x2], window slides down one word
   1197  1.1  christos 	mul	x14,x6,x4
   1198  1.1  christos 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
   1199  1.1  christos 	mul	x15,x7,x4
   1200  1.1  christos 	add	x27,x27,#8	// advance the multiplier index (no flags touched)
   1201  1.1  christos 	mul	x16,x8,x4
   1202  1.1  christos 	mul	x17,x9,x4
   1203  1.1  christos 	adds	x19,x19,x14	// add the low halves of the eight products
   1204  1.1  christos 	mul	x14,x10,x4
   1205  1.1  christos 	adcs	x20,x20,x15
   1206  1.1  christos 	mul	x15,x11,x4
   1207  1.1  christos 	adcs	x21,x21,x16
   1208  1.1  christos 	mul	x16,x12,x4
   1209  1.1  christos 	adcs	x22,x22,x17
   1210  1.1  christos 	mul	x17,x13,x4
   1211  1.1  christos 	adcs	x23,x23,x14
   1212  1.1  christos 	umulh	x14,x6,x4
   1213  1.1  christos 	adcs	x24,x24,x15
   1214  1.1  christos 	umulh	x15,x7,x4
   1215  1.1  christos 	adcs	x25,x25,x16
   1216  1.1  christos 	umulh	x16,x8,x4
   1217  1.1  christos 	adcs	x26,x26,x17
   1218  1.1  christos 	umulh	x17,x9,x4
   1219  1.1  christos 	adc	x28,x28,xzr	// x28 collects the carry out of the low-half chain
   1220  1.1  christos 	str	x19,[x2],#8	// lowest word receives nothing more from this multiplier: store it
   1221  1.1  christos 	adds	x19,x20,x14	// add the high halves, moving every word down one register
   1222  1.1  christos 	umulh	x14,x10,x4
   1223  1.1  christos 	adcs	x20,x21,x15
   1224  1.1  christos 	umulh	x15,x11,x4
   1225  1.1  christos 	adcs	x21,x22,x16
   1226  1.1  christos 	umulh	x16,x12,x4
   1227  1.1  christos 	adcs	x22,x23,x17
   1228  1.1  christos 	umulh	x17,x13,x4
   1229  1.1  christos 	ldr	x4,[x0,x27]	// next multiplier word, at byte offset x27 from x0
   1230  1.1  christos 	adcs	x23,x24,x14
   1231  1.1  christos 	adcs	x24,x25,x15
   1232  1.1  christos 	adcs	x25,x26,x16
   1233  1.1  christos 	adcs	x26,x28,x17
   1234  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1235  1.1  christos 	cbnz	x27,.Lsqr8x_mul	// eight multipliers per group (x27 runs from -64 to 0)
   1236  1.1  christos 					// note that carry flag is guaranteed
   1237  1.1  christos 					// to be zero at this point
   1238  1.1  christos 	cmp	x1,x3		// done yet?
   1239  1.1  christos 	b.eq	.Lsqr8x_break
   1240  1.1  christos 
   1241  1.1  christos 	ldp	x6,x7,[x2,#8*0]	// eight stored t[] words at x2
   1242  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1243  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1244  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1245  1.1  christos 	adds	x19,x19,x6	// fold them into the accumulators
   1246  1.1  christos 	ldur	x4,[x0,#-8*8]	// restart with the first multiplier: the word 64 bytes below x0
   1247  1.1  christos 	adcs	x20,x20,x7
   1248  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// x6-x13 = next 8 words of a[] at x1
   1249  1.1  christos 	adcs	x21,x21,x8
   1250  1.1  christos 	adcs	x22,x22,x9
   1251  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1252  1.1  christos 	adcs	x23,x23,x10
   1253  1.1  christos 	adcs	x24,x24,x11
   1254  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1255  1.1  christos 	adcs	x25,x25,x12
   1256  1.1  christos 	mov	x27,#-8*8	// reset the multiplier index
   1257  1.1  christos 	adcs	x26,x26,x13	// carry out of this chain is picked up by the adc at the top of .Lsqr8x_mul
   1258  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1259  1.1  christos 	add	x1,x1,#8*8
   1260  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1261  1.1  christos 	b	.Lsqr8x_mul
   1262  1.1  christos 
   1263  1.1  christos .align	4
   1264  1.1  christos .Lsqr8x_break:	// x1 reached x3: set up the next 8-word window for .Lsqr8x_outer_loop
   1265  1.1  christos 	ldp	x6,x7,[x0,#8*0]	// x6-x13 = the 8 words of a[] at x0
   1266  1.1  christos 	add	x1,x0,#8*8	// x1 = first word after that window
   1267  1.1  christos 	ldp	x8,x9,[x0,#8*2]
   1268  1.1  christos 	sub	x14,x3,x1		// is it last iteration?
   1269  1.1  christos 	ldp	x10,x11,[x0,#8*4]
   1270  1.1  christos 	sub	x15,x2,x14	// x15 = x2 minus the bytes of a[] left beyond the window
   1271  1.1  christos 	ldp	x12,x13,[x0,#8*6]
   1272  1.1  christos 	cbz	x14,.Lsqr8x_outer_loop	// nothing left beyond the window: keep x19-x26 and x2 as they are
   1273  1.1  christos 
   1274  1.1  christos 	stp	x19,x20,[x2,#8*0]	// otherwise spill the accumulators at x2 ...
   1275  1.1  christos 	ldp	x19,x20,[x15,#8*0]	// ... and reload x19-x26 from the eight words at x15
   1276  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1277  1.1  christos 	ldp	x21,x22,[x15,#8*2]
   1278  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1279  1.1  christos 	ldp	x23,x24,[x15,#8*4]
   1280  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1281  1.1  christos 	mov	x2,x15	// continue writing t[] from x15
   1282  1.1  christos 	ldp	x25,x26,[x15,#8*6]
   1283  1.1  christos 	b	.Lsqr8x_outer_loop
   1284  1.1  christos 
   1285  1.1  christos .align	4
   1286  1.1  christos .Lsqr8x_outer_break:
   1287  1.1  christos 	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
   1288  1.1  christos 	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
   1289  1.1  christos 	ldp	x15,x16,[sp,#8*1]
   1290  1.1  christos 	ldp	x11,x13,[x14,#8*2]
   1291  1.1  christos 	add	x1,x14,#8*4
   1292  1.1  christos 	ldp	x17,x14,[sp,#8*3]
   1293  1.1  christos 
   1294  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1295  1.1  christos 	mul	x19,x7,x7
   1296  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1297  1.1  christos 	umulh	x7,x7,x7
   1298  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1299  1.1  christos 	mul	x8,x9,x9
   1300  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1301  1.1  christos 	mov	x2,sp
   1302  1.1  christos 	umulh	x9,x9,x9
   1303  1.1  christos 	adds	x20,x7,x15,lsl#1
   1304  1.1  christos 	extr	x15,x16,x15,#63
   1305  1.1  christos 	sub	x27,x5,#8*4
   1306  1.1  christos 
   1307  1.1  christos .Lsqr4x_shift_n_add:
   1308  1.1  christos 	adcs	x21,x8,x15
   1309  1.1  christos 	extr	x16,x17,x16,#63
   1310  1.1  christos 	sub	x27,x27,#8*4
   1311  1.1  christos 	adcs	x22,x9,x16
   1312  1.1  christos 	ldp	x15,x16,[x2,#8*5]
   1313  1.1  christos 	mul	x10,x11,x11
   1314  1.1  christos 	ldp	x7,x9,[x1],#8*2
   1315  1.1  christos 	umulh	x11,x11,x11
   1316  1.1  christos 	mul	x12,x13,x13
   1317  1.1  christos 	umulh	x13,x13,x13
   1318  1.1  christos 	extr	x17,x14,x17,#63
   1319  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1320  1.1  christos 	adcs	x23,x10,x17
   1321  1.1  christos 	extr	x14,x15,x14,#63
   1322  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1323  1.1  christos 	adcs	x24,x11,x14
   1324  1.1  christos 	ldp	x17,x14,[x2,#8*7]
   1325  1.1  christos 	extr	x15,x16,x15,#63
   1326  1.1  christos 	adcs	x25,x12,x15
   1327  1.1  christos 	extr	x16,x17,x16,#63
   1328  1.1  christos 	adcs	x26,x13,x16
   1329  1.1  christos 	ldp	x15,x16,[x2,#8*9]
   1330  1.1  christos 	mul	x6,x7,x7
   1331  1.1  christos 	ldp	x11,x13,[x1],#8*2
   1332  1.1  christos 	umulh	x7,x7,x7
   1333  1.1  christos 	mul	x8,x9,x9
   1334  1.1  christos 	umulh	x9,x9,x9
   1335  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1336  1.1  christos 	extr	x17,x14,x17,#63
   1337  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1338  1.1  christos 	add	x2,x2,#8*8
   1339  1.1  christos 	adcs	x19,x6,x17
   1340  1.1  christos 	extr	x14,x15,x14,#63
   1341  1.1  christos 	adcs	x20,x7,x14
   1342  1.1  christos 	ldp	x17,x14,[x2,#8*3]
   1343  1.1  christos 	extr	x15,x16,x15,#63
   1344  1.1  christos 	cbnz	x27,.Lsqr4x_shift_n_add
   1345  1.1  christos 	ldp	x1,x4,[x29,#104]	// pull np and n0
   1346  1.1  christos 
   1347  1.1  christos 	adcs	x21,x8,x15
   1348  1.1  christos 	extr	x16,x17,x16,#63
   1349  1.1  christos 	adcs	x22,x9,x16
   1350  1.1  christos 	ldp	x15,x16,[x2,#8*5]
   1351  1.1  christos 	mul	x10,x11,x11
   1352  1.1  christos 	umulh	x11,x11,x11
   1353  1.1  christos 	stp	x19,x20,[x2,#8*0]
   1354  1.1  christos 	mul	x12,x13,x13
   1355  1.1  christos 	umulh	x13,x13,x13
   1356  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1357  1.1  christos 	extr	x17,x14,x17,#63
   1358  1.1  christos 	adcs	x23,x10,x17
   1359  1.1  christos 	extr	x14,x15,x14,#63
   1360  1.1  christos 	ldp	x19,x20,[sp,#8*0]
   1361  1.1  christos 	adcs	x24,x11,x14
   1362  1.1  christos 	extr	x15,x16,x15,#63
   1363  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1364  1.1  christos 	adcs	x25,x12,x15
   1365  1.1  christos 	extr	x16,xzr,x16,#63
   1366  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1367  1.1  christos 	adc	x26,x13,x16
   1368  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1369  1.1  christos 
   1370  1.1  christos 	// Reduce by 512 bits per iteration
   1371  1.1  christos 	mul	x28,x4,x19		// t[0]*n0
   1372  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1373  1.1  christos 	add	x3,x1,x5
   1374  1.1  christos 	ldp	x21,x22,[sp,#8*2]
   1375  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1376  1.1  christos 	ldp	x23,x24,[sp,#8*4]
   1377  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1378  1.1  christos 	ldp	x25,x26,[sp,#8*6]
   1379  1.1  christos 	add	x1,x1,#8*8
   1380  1.1  christos 	mov	x30,xzr		// initial top-most carry
   1381  1.1  christos 	mov	x2,sp
   1382  1.1  christos 	mov	x27,#8
   1383  1.1  christos 
   1384  1.1  christos .Lsqr8x_reduction:
   1385  1.1  christos 	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
   1386  1.1  christos 	mul	x15,x7,x28
   1387  1.1  christos 	sub	x27,x27,#1
   1388  1.1  christos 	mul	x16,x8,x28
   1389  1.1  christos 	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
   1390  1.1  christos 	mul	x17,x9,x28
   1391  1.1  christos 	// (*)	adds	xzr,x19,x14
   1392  1.1  christos 	subs	xzr,x19,#1		// (*)
   1393  1.1  christos 	mul	x14,x10,x28
   1394  1.1  christos 	adcs	x19,x20,x15
   1395  1.1  christos 	mul	x15,x11,x28
   1396  1.1  christos 	adcs	x20,x21,x16
   1397  1.1  christos 	mul	x16,x12,x28
   1398  1.1  christos 	adcs	x21,x22,x17
   1399  1.1  christos 	mul	x17,x13,x28
   1400  1.1  christos 	adcs	x22,x23,x14
   1401  1.1  christos 	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
   1402  1.1  christos 	adcs	x23,x24,x15
   1403  1.1  christos 	umulh	x15,x7,x28
   1404  1.1  christos 	adcs	x24,x25,x16
   1405  1.1  christos 	umulh	x16,x8,x28
   1406  1.1  christos 	adcs	x25,x26,x17
   1407  1.1  christos 	umulh	x17,x9,x28
   1408  1.1  christos 	adc	x26,xzr,xzr
   1409  1.1  christos 	adds	x19,x19,x14
   1410  1.1  christos 	umulh	x14,x10,x28
   1411  1.1  christos 	adcs	x20,x20,x15
   1412  1.1  christos 	umulh	x15,x11,x28
   1413  1.1  christos 	adcs	x21,x21,x16
   1414  1.1  christos 	umulh	x16,x12,x28
   1415  1.1  christos 	adcs	x22,x22,x17
   1416  1.1  christos 	umulh	x17,x13,x28
   1417  1.1  christos 	mul	x28,x4,x19		// next t[0]*n0
   1418  1.1  christos 	adcs	x23,x23,x14
   1419  1.1  christos 	adcs	x24,x24,x15
   1420  1.1  christos 	adcs	x25,x25,x16
   1421  1.1  christos 	adc	x26,x26,x17
   1422  1.1  christos 	cbnz	x27,.Lsqr8x_reduction
   1423  1.1  christos 
   1424  1.1  christos 	ldp	x14,x15,[x2,#8*0]
   1425  1.1  christos 	ldp	x16,x17,[x2,#8*2]
   1426  1.1  christos 	mov	x0,x2
   1427  1.1  christos 	sub	x27,x3,x1	// done yet?
   1428  1.1  christos 	adds	x19,x19,x14
   1429  1.1  christos 	adcs	x20,x20,x15
   1430  1.1  christos 	ldp	x14,x15,[x2,#8*4]
   1431  1.1  christos 	adcs	x21,x21,x16
   1432  1.1  christos 	adcs	x22,x22,x17
   1433  1.1  christos 	ldp	x16,x17,[x2,#8*6]
   1434  1.1  christos 	adcs	x23,x23,x14
   1435  1.1  christos 	adcs	x24,x24,x15
   1436  1.1  christos 	adcs	x25,x25,x16
   1437  1.1  christos 	adcs	x26,x26,x17
   1438  1.1  christos 	//adc	x28,xzr,xzr		// moved below
   1439  1.1  christos 	cbz	x27,.Lsqr8x8_post_condition
   1440  1.1  christos 
   1441  1.1  christos 	ldur	x4,[x2,#-8*8]
   1442  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1443  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1444  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1445  1.1  christos 	mov	x27,#-8*8
   1446  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1447  1.1  christos 	add	x1,x1,#8*8
   1448  1.1  christos 
   1449  1.1  christos .Lsqr8x_tail:
   1450  1.1  christos 	mul	x14,x6,x4
   1451  1.1  christos 	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
   1452  1.1  christos 	mul	x15,x7,x4
   1453  1.1  christos 	add	x27,x27,#8
   1454  1.1  christos 	mul	x16,x8,x4
   1455  1.1  christos 	mul	x17,x9,x4
   1456  1.1  christos 	adds	x19,x19,x14
   1457  1.1  christos 	mul	x14,x10,x4
   1458  1.1  christos 	adcs	x20,x20,x15
   1459  1.1  christos 	mul	x15,x11,x4
   1460  1.1  christos 	adcs	x21,x21,x16
   1461  1.1  christos 	mul	x16,x12,x4
   1462  1.1  christos 	adcs	x22,x22,x17
   1463  1.1  christos 	mul	x17,x13,x4
   1464  1.1  christos 	adcs	x23,x23,x14
   1465  1.1  christos 	umulh	x14,x6,x4
   1466  1.1  christos 	adcs	x24,x24,x15
   1467  1.1  christos 	umulh	x15,x7,x4
   1468  1.1  christos 	adcs	x25,x25,x16
   1469  1.1  christos 	umulh	x16,x8,x4
   1470  1.1  christos 	adcs	x26,x26,x17
   1471  1.1  christos 	umulh	x17,x9,x4
   1472  1.1  christos 	adc	x28,x28,xzr
   1473  1.1  christos 	str	x19,[x2],#8
   1474  1.1  christos 	adds	x19,x20,x14
   1475  1.1  christos 	umulh	x14,x10,x4
   1476  1.1  christos 	adcs	x20,x21,x15
   1477  1.1  christos 	umulh	x15,x11,x4
   1478  1.1  christos 	adcs	x21,x22,x16
   1479  1.1  christos 	umulh	x16,x12,x4
   1480  1.1  christos 	adcs	x22,x23,x17
   1481  1.1  christos 	umulh	x17,x13,x4
   1482  1.1  christos 	ldr	x4,[x0,x27]
   1483  1.1  christos 	adcs	x23,x24,x14
   1484  1.1  christos 	adcs	x24,x25,x15
   1485  1.1  christos 	adcs	x25,x26,x16
   1486  1.1  christos 	adcs	x26,x28,x17
   1487  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1488  1.1  christos 	cbnz	x27,.Lsqr8x_tail
   1489  1.1  christos 					// note that carry flag is guaranteed
   1490  1.1  christos 					// to be zero at this point
   1491  1.1  christos 	ldp	x6,x7,[x2,#8*0]
   1492  1.1  christos 	sub	x27,x3,x1	// done yet?
   1493  1.1  christos 	sub	x16,x3,x5	// rewound np
   1494  1.1  christos 	ldp	x8,x9,[x2,#8*2]
   1495  1.1  christos 	ldp	x10,x11,[x2,#8*4]
   1496  1.1  christos 	ldp	x12,x13,[x2,#8*6]
   1497  1.1  christos 	cbz	x27,.Lsqr8x_tail_break
   1498  1.1  christos 
   1499  1.1  christos 	ldur	x4,[x0,#-8*8]
   1500  1.1  christos 	adds	x19,x19,x6
   1501  1.1  christos 	adcs	x20,x20,x7
   1502  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1503  1.1  christos 	adcs	x21,x21,x8
   1504  1.1  christos 	adcs	x22,x22,x9
   1505  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1506  1.1  christos 	adcs	x23,x23,x10
   1507  1.1  christos 	adcs	x24,x24,x11
   1508  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1509  1.1  christos 	adcs	x25,x25,x12
   1510  1.1  christos 	mov	x27,#-8*8
   1511  1.1  christos 	adcs	x26,x26,x13
   1512  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1513  1.1  christos 	add	x1,x1,#8*8
   1514  1.1  christos 	//adc	x28,xzr,xzr		// moved above
   1515  1.1  christos 	b	.Lsqr8x_tail
   1516  1.1  christos 
   1517  1.1  christos .align	4
   1518  1.1  christos .Lsqr8x_tail_break:
   1519  1.1  christos 	ldr	x4,[x29,#112]		// pull n0
   1520  1.1  christos 	add	x27,x2,#8*8		// end of current t[num] window
   1521  1.1  christos 
   1522  1.1  christos 	subs	xzr,x30,#1		// "move" top-most carry to carry bit
   1523  1.1  christos 	adcs	x14,x19,x6
   1524  1.1  christos 	adcs	x15,x20,x7
   1525  1.1  christos 	ldp	x19,x20,[x0,#8*0]
   1526  1.1  christos 	adcs	x21,x21,x8
   1527  1.1  christos 	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
   1528  1.1  christos 	adcs	x22,x22,x9
   1529  1.1  christos 	ldp	x8,x9,[x16,#8*2]
   1530  1.1  christos 	adcs	x23,x23,x10
   1531  1.1  christos 	adcs	x24,x24,x11
   1532  1.1  christos 	ldp	x10,x11,[x16,#8*4]
   1533  1.1  christos 	adcs	x25,x25,x12
   1534  1.1  christos 	adcs	x26,x26,x13
   1535  1.1  christos 	ldp	x12,x13,[x16,#8*6]
   1536  1.1  christos 	add	x1,x16,#8*8
   1537  1.1  christos 	adc	x30,xzr,xzr	// top-most carry
   1538  1.1  christos 	mul	x28,x4,x19
   1539  1.1  christos 	stp	x14,x15,[x2,#8*0]
   1540  1.1  christos 	stp	x21,x22,[x2,#8*2]
   1541  1.1  christos 	ldp	x21,x22,[x0,#8*2]
   1542  1.1  christos 	stp	x23,x24,[x2,#8*4]
   1543  1.1  christos 	ldp	x23,x24,[x0,#8*4]
   1544  1.1  christos 	cmp	x27,x29		// did we hit the bottom?
   1545  1.1  christos 	stp	x25,x26,[x2,#8*6]
   1546  1.1  christos 	mov	x2,x0			// slide the window
   1547  1.1  christos 	ldp	x25,x26,[x0,#8*6]
   1548  1.1  christos 	mov	x27,#8
   1549  1.1  christos 	b.ne	.Lsqr8x_reduction
   1550  1.1  christos 
   1551  1.1  christos 	// Final step. We see if result is larger than modulus, and
   1552  1.1  christos 	// if it is, subtract the modulus. But comparison implies
   1553  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
   1554  1.1  christos 	// and conditionally copy original value.
   1555  1.1  christos 	ldr	x0,[x29,#96]		// pull rp
   1556  1.1  christos 	add	x2,x2,#8*8
   1557  1.1  christos 	subs	x14,x19,x6
   1558  1.1  christos 	sbcs	x15,x20,x7
   1559  1.1  christos 	sub	x27,x5,#8*8
   1560  1.1  christos 	mov	x3,x0		// x0 copy
   1561  1.1  christos 
   1562  1.1  christos .Lsqr8x_sub:
   1563  1.1  christos 	sbcs	x16,x21,x8
   1564  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1565  1.1  christos 	sbcs	x17,x22,x9
   1566  1.1  christos 	stp	x14,x15,[x0,#8*0]
   1567  1.1  christos 	sbcs	x14,x23,x10
   1568  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1569  1.1  christos 	sbcs	x15,x24,x11
   1570  1.1  christos 	stp	x16,x17,[x0,#8*2]
   1571  1.1  christos 	sbcs	x16,x25,x12
   1572  1.1  christos 	ldp	x10,x11,[x1,#8*4]
   1573  1.1  christos 	sbcs	x17,x26,x13
   1574  1.1  christos 	ldp	x12,x13,[x1,#8*6]
   1575  1.1  christos 	add	x1,x1,#8*8
   1576  1.1  christos 	ldp	x19,x20,[x2,#8*0]
   1577  1.1  christos 	sub	x27,x27,#8*8
   1578  1.1  christos 	ldp	x21,x22,[x2,#8*2]
   1579  1.1  christos 	ldp	x23,x24,[x2,#8*4]
   1580  1.1  christos 	ldp	x25,x26,[x2,#8*6]
   1581  1.1  christos 	add	x2,x2,#8*8
   1582  1.1  christos 	stp	x14,x15,[x0,#8*4]
   1583  1.1  christos 	sbcs	x14,x19,x6
   1584  1.1  christos 	stp	x16,x17,[x0,#8*6]
   1585  1.1  christos 	add	x0,x0,#8*8
   1586  1.1  christos 	sbcs	x15,x20,x7
   1587  1.1  christos 	cbnz	x27,.Lsqr8x_sub
   1588  1.1  christos 
   1589  1.1  christos 	sbcs	x16,x21,x8
   1590  1.1  christos 	mov	x2,sp
   1591  1.1  christos 	add	x1,sp,x5
   1592  1.1  christos 	ldp	x6,x7,[x3,#8*0]
   1593  1.1  christos 	sbcs	x17,x22,x9
   1594  1.1  christos 	stp	x14,x15,[x0,#8*0]
   1595  1.1  christos 	sbcs	x14,x23,x10
   1596  1.1  christos 	ldp	x8,x9,[x3,#8*2]
   1597  1.1  christos 	sbcs	x15,x24,x11
   1598  1.1  christos 	stp	x16,x17,[x0,#8*2]
   1599  1.1  christos 	sbcs	x16,x25,x12
   1600  1.1  christos 	ldp	x19,x20,[x1,#8*0]
   1601  1.1  christos 	sbcs	x17,x26,x13
   1602  1.1  christos 	ldp	x21,x22,[x1,#8*2]
   1603  1.1  christos 	sbcs	xzr,x30,xzr	// did it borrow?
   1604  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   1605  1.1  christos 	stp	x14,x15,[x0,#8*4]
   1606  1.1  christos 	stp	x16,x17,[x0,#8*6]
   1607  1.1  christos 
   1608  1.1  christos 	sub	x27,x5,#8*4
   1609  1.1  christos .Lsqr4x_cond_copy:
   1610  1.1  christos 	sub	x27,x27,#8*4
   1611  1.1  christos 	csel	x14,x19,x6,lo
   1612  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
   1613  1.1  christos 	csel	x15,x20,x7,lo
   1614  1.1  christos 	ldp	x6,x7,[x3,#8*4]
   1615  1.1  christos 	ldp	x19,x20,[x1,#8*4]
   1616  1.1  christos 	csel	x16,x21,x8,lo
   1617  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
   1618  1.1  christos 	add	x2,x2,#8*4
   1619  1.1  christos 	csel	x17,x22,x9,lo
   1620  1.1  christos 	ldp	x8,x9,[x3,#8*6]
   1621  1.1  christos 	ldp	x21,x22,[x1,#8*6]
   1622  1.1  christos 	add	x1,x1,#8*4
   1623  1.1  christos 	stp	x14,x15,[x3,#8*0]
   1624  1.1  christos 	stp	x16,x17,[x3,#8*2]
   1625  1.1  christos 	add	x3,x3,#8*4
   1626  1.1  christos 	stp	xzr,xzr,[x1,#8*0]
   1627  1.1  christos 	stp	xzr,xzr,[x1,#8*2]
   1628  1.1  christos 	cbnz	x27,.Lsqr4x_cond_copy
   1629  1.1  christos 
   1630  1.1  christos 	csel	x14,x19,x6,lo
   1631  1.1  christos 	stp	xzr,xzr,[x2,#8*0]
   1632  1.1  christos 	csel	x15,x20,x7,lo
   1633  1.1  christos 	stp	xzr,xzr,[x2,#8*2]
   1634  1.1  christos 	csel	x16,x21,x8,lo
   1635  1.1  christos 	csel	x17,x22,x9,lo
   1636  1.1  christos 	stp	x14,x15,[x3,#8*0]
   1637  1.1  christos 	stp	x16,x17,[x3,#8*2]
   1638  1.1  christos 
   1639  1.1  christos 	b	.Lsqr8x_done
   1640  1.1  christos 
   1641  1.1  christos .align	4
   1642  1.1  christos .Lsqr8x8_post_condition:
   1643  1.1  christos 	adc	x28,xzr,xzr
   1644  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   1645  1.1  christos 	// x19-x26,x28 hold result, x6-x13 hold modulus
   1646  1.1  christos 	subs	x6,x19,x6
   1647  1.1  christos 	ldr	x1,[x29,#96]		// pull rp
   1648  1.1  christos 	sbcs	x7,x20,x7
   1649  1.1  christos 	stp	xzr,xzr,[sp,#8*0]
   1650  1.1  christos 	sbcs	x8,x21,x8
   1651  1.1  christos 	stp	xzr,xzr,[sp,#8*2]
   1652  1.1  christos 	sbcs	x9,x22,x9
   1653  1.1  christos 	stp	xzr,xzr,[sp,#8*4]
   1654  1.1  christos 	sbcs	x10,x23,x10
   1655  1.1  christos 	stp	xzr,xzr,[sp,#8*6]
   1656  1.1  christos 	sbcs	x11,x24,x11
   1657  1.1  christos 	stp	xzr,xzr,[sp,#8*8]
   1658  1.1  christos 	sbcs	x12,x25,x12
   1659  1.1  christos 	stp	xzr,xzr,[sp,#8*10]
   1660  1.1  christos 	sbcs	x13,x26,x13
   1661  1.1  christos 	stp	xzr,xzr,[sp,#8*12]
   1662  1.1  christos 	sbcs	x28,x28,xzr	// did it borrow?
   1663  1.1  christos 	stp	xzr,xzr,[sp,#8*14]
   1664  1.1  christos 
   1665  1.1  christos 	// x6-x13 hold result-modulus
   1666  1.1  christos 	csel	x6,x19,x6,lo
   1667  1.1  christos 	csel	x7,x20,x7,lo
   1668  1.1  christos 	csel	x8,x21,x8,lo
   1669  1.1  christos 	csel	x9,x22,x9,lo
   1670  1.1  christos 	stp	x6,x7,[x1,#8*0]
   1671  1.1  christos 	csel	x10,x23,x10,lo
   1672  1.1  christos 	csel	x11,x24,x11,lo
   1673  1.1  christos 	stp	x8,x9,[x1,#8*2]
   1674  1.1  christos 	csel	x12,x25,x12,lo
   1675  1.1  christos 	csel	x13,x26,x13,lo
   1676  1.1  christos 	stp	x10,x11,[x1,#8*4]
   1677  1.1  christos 	stp	x12,x13,[x1,#8*6]
   1678  1.1  christos 
   1679  1.1  christos .Lsqr8x_done:
   1680  1.1  christos 	ldp	x19,x20,[x29,#16]
   1681  1.1  christos 	mov	sp,x29
   1682  1.1  christos 	ldp	x21,x22,[x29,#32]
   1683  1.1  christos 	mov	x0,#1
   1684  1.1  christos 	ldp	x23,x24,[x29,#48]
   1685  1.1  christos 	ldp	x25,x26,[x29,#64]
   1686  1.1  christos 	ldp	x27,x28,[x29,#80]
   1687  1.1  christos 	ldr	x29,[sp],#128
   1688  1.2  christos 	// x30 is loaded earlier
   1689  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   1690  1.1  christos 	ret
   1691  1.1  christos .size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
   1692  1.1  christos .type	__bn_mul4x_mont,%function
   1693  1.1  christos .align	5
   1694  1.1  christos __bn_mul4x_mont:
   1695  1.2  christos 	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
   1696  1.2  christos 	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
   1697  1.1  christos 	stp	x29,x30,[sp,#-128]!
   1698  1.1  christos 	add	x29,sp,#0
   1699  1.1  christos 	stp	x19,x20,[sp,#16]
   1700  1.1  christos 	stp	x21,x22,[sp,#32]
   1701  1.1  christos 	stp	x23,x24,[sp,#48]
   1702  1.1  christos 	stp	x25,x26,[sp,#64]
   1703  1.1  christos 	stp	x27,x28,[sp,#80]
   1704  1.1  christos 
   1705  1.1  christos 	sub	x26,sp,x5,lsl#3
   1706  1.1  christos 	lsl	x5,x5,#3
   1707  1.1  christos 	ldr	x4,[x4]		// *n0
   1708  1.1  christos 	sub	sp,x26,#8*4		// alloca
   1709  1.1  christos 
   1710  1.1  christos 	add	x10,x2,x5
   1711  1.1  christos 	add	x27,x1,x5
   1712  1.1  christos 	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
   1713  1.1  christos 
   1714  1.1  christos 	ldr	x24,[x2,#8*0]		// b[0]
   1715  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   1716  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1717  1.1  christos 	add	x1,x1,#8*4
   1718  1.1  christos 	mov	x19,xzr
   1719  1.1  christos 	mov	x20,xzr
   1720  1.1  christos 	mov	x21,xzr
   1721  1.1  christos 	mov	x22,xzr
   1722  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1723  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1724  1.1  christos 	adds	x3,x3,#8*4		// clear carry bit
   1725  1.1  christos 	mov	x0,xzr
   1726  1.1  christos 	mov	x28,#0
   1727  1.1  christos 	mov	x26,sp
   1728  1.1  christos 
   1729  1.1  christos .Loop_mul4x_1st_reduction:
   1730  1.1  christos 	mul	x10,x6,x24		// lo(a[0..3]*b[0])
   1731  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1732  1.1  christos 	mul	x11,x7,x24
   1733  1.1  christos 	add	x28,x28,#8
   1734  1.1  christos 	mul	x12,x8,x24
   1735  1.1  christos 	and	x28,x28,#31
   1736  1.1  christos 	mul	x13,x9,x24
   1737  1.1  christos 	adds	x19,x19,x10
   1738  1.1  christos 	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
   1739  1.1  christos 	adcs	x20,x20,x11
   1740  1.1  christos 	mul	x25,x19,x4		// t[0]*n0
   1741  1.1  christos 	adcs	x21,x21,x12
   1742  1.1  christos 	umulh	x11,x7,x24
   1743  1.1  christos 	adcs	x22,x22,x13
   1744  1.1  christos 	umulh	x12,x8,x24
   1745  1.1  christos 	adc	x23,xzr,xzr
   1746  1.1  christos 	umulh	x13,x9,x24
   1747  1.1  christos 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1748  1.1  christos 	adds	x20,x20,x10
   1749  1.1  christos 	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
   1750  1.1  christos 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1751  1.1  christos 	adcs	x21,x21,x11
   1752  1.1  christos 	mul	x11,x15,x25
   1753  1.1  christos 	adcs	x22,x22,x12
   1754  1.1  christos 	mul	x12,x16,x25
   1755  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1756  1.1  christos 	mul	x13,x17,x25
   1757  1.1  christos 	// (*)	adds	xzr,x19,x10
   1758  1.1  christos 	subs	xzr,x19,#1		// (*)
   1759  1.1  christos 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1760  1.1  christos 	adcs	x19,x20,x11
   1761  1.1  christos 	umulh	x11,x15,x25
   1762  1.1  christos 	adcs	x20,x21,x12
   1763  1.1  christos 	umulh	x12,x16,x25
   1764  1.1  christos 	adcs	x21,x22,x13
   1765  1.1  christos 	umulh	x13,x17,x25
   1766  1.1  christos 	adcs	x22,x23,x0
   1767  1.1  christos 	adc	x0,xzr,xzr
   1768  1.1  christos 	adds	x19,x19,x10
   1769  1.1  christos 	sub	x10,x27,x1
   1770  1.1  christos 	adcs	x20,x20,x11
   1771  1.1  christos 	adcs	x21,x21,x12
   1772  1.1  christos 	adcs	x22,x22,x13
   1773  1.1  christos 	//adc	x0,x0,xzr
   1774  1.1  christos 	cbnz	x28,.Loop_mul4x_1st_reduction
   1775  1.1  christos 
   1776  1.1  christos 	cbz	x10,.Lmul4x4_post_condition
   1777  1.1  christos 
   1778  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1779  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1780  1.1  christos 	add	x1,x1,#8*4
   1781  1.1  christos 	ldr	x25,[sp]		// t[0]*n0
   1782  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1783  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1784  1.1  christos 	add	x3,x3,#8*4
   1785  1.1  christos 
   1786  1.1  christos .Loop_mul4x_1st_tail:
   1787  1.1  christos 	mul	x10,x6,x24		// lo(a[4..7]*b[i])
   1788  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1789  1.1  christos 	mul	x11,x7,x24
   1790  1.1  christos 	add	x28,x28,#8
   1791  1.1  christos 	mul	x12,x8,x24
   1792  1.1  christos 	and	x28,x28,#31
   1793  1.1  christos 	mul	x13,x9,x24
   1794  1.1  christos 	adds	x19,x19,x10
   1795  1.1  christos 	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
   1796  1.1  christos 	adcs	x20,x20,x11
   1797  1.1  christos 	umulh	x11,x7,x24
   1798  1.1  christos 	adcs	x21,x21,x12
   1799  1.1  christos 	umulh	x12,x8,x24
   1800  1.1  christos 	adcs	x22,x22,x13
   1801  1.1  christos 	umulh	x13,x9,x24
   1802  1.1  christos 	adc	x23,xzr,xzr
   1803  1.1  christos 	ldr	x24,[x2,x28]		// next b[i] (or b[0])
   1804  1.1  christos 	adds	x20,x20,x10
   1805  1.1  christos 	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
   1806  1.1  christos 	adcs	x21,x21,x11
   1807  1.1  christos 	mul	x11,x15,x25
   1808  1.1  christos 	adcs	x22,x22,x12
   1809  1.1  christos 	mul	x12,x16,x25
   1810  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1811  1.1  christos 	mul	x13,x17,x25
   1812  1.1  christos 	adds	x19,x19,x10
   1813  1.1  christos 	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
   1814  1.1  christos 	adcs	x20,x20,x11
   1815  1.1  christos 	umulh	x11,x15,x25
   1816  1.1  christos 	adcs	x21,x21,x12
   1817  1.1  christos 	umulh	x12,x16,x25
   1818  1.1  christos 	adcs	x22,x22,x13
   1819  1.1  christos 	adcs	x23,x23,x0
   1820  1.1  christos 	umulh	x13,x17,x25
   1821  1.1  christos 	adc	x0,xzr,xzr
   1822  1.1  christos 	ldr	x25,[sp,x28]		// next t[0]*n0
   1823  1.1  christos 	str	x19,[x26],#8		// result!!!
   1824  1.1  christos 	adds	x19,x20,x10
   1825  1.1  christos 	sub	x10,x27,x1		// done yet?
   1826  1.1  christos 	adcs	x20,x21,x11
   1827  1.1  christos 	adcs	x21,x22,x12
   1828  1.1  christos 	adcs	x22,x23,x13
   1829  1.1  christos 	//adc	x0,x0,xzr
   1830  1.1  christos 	cbnz	x28,.Loop_mul4x_1st_tail
   1831  1.1  christos 
   1832  1.1  christos 	sub	x11,x27,x5	// rewound ap
   1833  1.1  christos 	cbz	x10,.Lmul4x_proceed
   1834  1.1  christos 
   1835  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1836  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1837  1.1  christos 	add	x1,x1,#8*4
   1838  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   1839  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1840  1.1  christos 	add	x3,x3,#8*4
   1841  1.1  christos 	b	.Loop_mul4x_1st_tail
   1842  1.1  christos 
   1843  1.1  christos .align	5
   1844  1.1  christos .Lmul4x_proceed:
   1845  1.1  christos 	ldr	x24,[x2,#8*4]!		// *++b
   1846  1.1  christos 	adc	x30,x0,xzr
   1847  1.1  christos 	ldp	x6,x7,[x11,#8*0]	// a[0..3]
   1848  1.1  christos 	sub	x3,x3,x5		// rewind np
   1849  1.1  christos 	ldp	x8,x9,[x11,#8*2]
   1850  1.1  christos 	add	x1,x11,#8*4
   1851  1.1  christos 
   1852  1.1  christos 	stp	x19,x20,[x26,#8*0]	// result!!!
   1853  1.1  christos 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   1854  1.1  christos 	stp	x21,x22,[x26,#8*2]	// result!!!
   1855  1.1  christos 	ldp	x21,x22,[sp,#8*6]
   1856  1.1  christos 
   1857  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[0..3]
   1858  1.1  christos 	mov	x26,sp
   1859  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1860  1.1  christos 	adds	x3,x3,#8*4		// clear carry bit
   1861  1.1  christos 	mov	x0,xzr
   1862  1.1  christos 
   1863  1.1  christos .align	4
   1864  1.1  christos .Loop_mul4x_reduction:
   1865  1.1  christos 	mul	x10,x6,x24		// lo(a[0..3]*b[4])
   1866  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1867  1.1  christos 	mul	x11,x7,x24
   1868  1.1  christos 	add	x28,x28,#8
   1869  1.1  christos 	mul	x12,x8,x24
   1870  1.1  christos 	and	x28,x28,#31
   1871  1.1  christos 	mul	x13,x9,x24
   1872  1.1  christos 	adds	x19,x19,x10
   1873  1.1  christos 	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
   1874  1.1  christos 	adcs	x20,x20,x11
   1875  1.1  christos 	mul	x25,x19,x4		// t[0]*n0
   1876  1.1  christos 	adcs	x21,x21,x12
   1877  1.1  christos 	umulh	x11,x7,x24
   1878  1.1  christos 	adcs	x22,x22,x13
   1879  1.1  christos 	umulh	x12,x8,x24
   1880  1.1  christos 	adc	x23,xzr,xzr
   1881  1.1  christos 	umulh	x13,x9,x24
   1882  1.1  christos 	ldr	x24,[x2,x28]		// next b[i]
   1883  1.1  christos 	adds	x20,x20,x10
   1884  1.1  christos 	// (*)	mul	x10,x14,x25
   1885  1.1  christos 	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
   1886  1.1  christos 	adcs	x21,x21,x11
   1887  1.1  christos 	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
   1888  1.1  christos 	adcs	x22,x22,x12
   1889  1.1  christos 	mul	x12,x16,x25
   1890  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1891  1.1  christos 	mul	x13,x17,x25
   1892  1.1  christos 	// (*)	adds	xzr,x19,x10
   1893  1.1  christos 	subs	xzr,x19,#1		// (*)
   1894  1.1  christos 	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
   1895  1.1  christos 	adcs	x19,x20,x11
   1896  1.1  christos 	umulh	x11,x15,x25
   1897  1.1  christos 	adcs	x20,x21,x12
   1898  1.1  christos 	umulh	x12,x16,x25
   1899  1.1  christos 	adcs	x21,x22,x13
   1900  1.1  christos 	umulh	x13,x17,x25
   1901  1.1  christos 	adcs	x22,x23,x0
   1902  1.1  christos 	adc	x0,xzr,xzr
   1903  1.1  christos 	adds	x19,x19,x10
   1904  1.1  christos 	adcs	x20,x20,x11
   1905  1.1  christos 	adcs	x21,x21,x12
   1906  1.1  christos 	adcs	x22,x22,x13
   1907  1.1  christos 	//adc	x0,x0,xzr
   1908  1.1  christos 	cbnz	x28,.Loop_mul4x_reduction
   1909  1.1  christos 
   1910  1.1  christos 	adc	x0,x0,xzr
   1911  1.1  christos 	ldp	x10,x11,[x26,#8*4]	// t[4..7]
   1912  1.1  christos 	ldp	x12,x13,[x26,#8*6]
   1913  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[4..7]
   1914  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1915  1.1  christos 	add	x1,x1,#8*4
   1916  1.1  christos 	adds	x19,x19,x10
   1917  1.1  christos 	adcs	x20,x20,x11
   1918  1.1  christos 	adcs	x21,x21,x12
   1919  1.1  christos 	adcs	x22,x22,x13
   1920  1.1  christos 	//adc	x0,x0,xzr
   1921  1.1  christos 
   1922  1.1  christos 	ldr	x25,[sp]		// t[0]*n0
   1923  1.1  christos 	ldp	x14,x15,[x3,#8*0]	// n[4..7]
   1924  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1925  1.1  christos 	add	x3,x3,#8*4
   1926  1.1  christos 
   1927  1.1  christos .align	4
   1928  1.1  christos .Loop_mul4x_tail:
   1929  1.1  christos 	mul	x10,x6,x24		// lo(a[4..7]*b[4])
   1930  1.1  christos 	adc	x0,x0,xzr	// modulo-scheduled
   1931  1.1  christos 	mul	x11,x7,x24
   1932  1.1  christos 	add	x28,x28,#8
   1933  1.1  christos 	mul	x12,x8,x24
   1934  1.1  christos 	and	x28,x28,#31
   1935  1.1  christos 	mul	x13,x9,x24
   1936  1.1  christos 	adds	x19,x19,x10
   1937  1.1  christos 	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
   1938  1.1  christos 	adcs	x20,x20,x11
   1939  1.1  christos 	umulh	x11,x7,x24
   1940  1.1  christos 	adcs	x21,x21,x12
   1941  1.1  christos 	umulh	x12,x8,x24
   1942  1.1  christos 	adcs	x22,x22,x13
   1943  1.1  christos 	umulh	x13,x9,x24
   1944  1.1  christos 	adc	x23,xzr,xzr
   1945  1.1  christos 	ldr	x24,[x2,x28]		// next b[i]
   1946  1.1  christos 	adds	x20,x20,x10
   1947  1.1  christos 	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
   1948  1.1  christos 	adcs	x21,x21,x11
   1949  1.1  christos 	mul	x11,x15,x25
   1950  1.1  christos 	adcs	x22,x22,x12
   1951  1.1  christos 	mul	x12,x16,x25
   1952  1.1  christos 	adc	x23,x23,x13		// can't overflow
   1953  1.1  christos 	mul	x13,x17,x25
   1954  1.1  christos 	adds	x19,x19,x10
   1955  1.1  christos 	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
   1956  1.1  christos 	adcs	x20,x20,x11
   1957  1.1  christos 	umulh	x11,x15,x25
   1958  1.1  christos 	adcs	x21,x21,x12
   1959  1.1  christos 	umulh	x12,x16,x25
   1960  1.1  christos 	adcs	x22,x22,x13
   1961  1.1  christos 	umulh	x13,x17,x25
   1962  1.1  christos 	adcs	x23,x23,x0
   1963  1.1  christos 	ldr	x25,[sp,x28]		// next t[0]*n0
   1964  1.1  christos 	adc	x0,xzr,xzr
   1965  1.1  christos 	str	x19,[x26],#8		// result!!!
   1966  1.1  christos 	adds	x19,x20,x10
   1967  1.1  christos 	sub	x10,x27,x1		// done yet?
   1968  1.1  christos 	adcs	x20,x21,x11
   1969  1.1  christos 	adcs	x21,x22,x12
   1970  1.1  christos 	adcs	x22,x23,x13
   1971  1.1  christos 	//adc	x0,x0,xzr
   1972  1.1  christos 	cbnz	x28,.Loop_mul4x_tail
   1973  1.1  christos 
   1974  1.1  christos 	sub	x11,x3,x5		// rewound np
   1975  1.1  christos 	adc	x0,x0,xzr
   1976  1.1  christos 	cbz	x10,.Loop_mul4x_break
   1977  1.1  christos 
   1978  1.1  christos 	ldp	x10,x11,[x26,#8*4]
   1979  1.1  christos 	ldp	x12,x13,[x26,#8*6]
   1980  1.1  christos 	ldp	x6,x7,[x1,#8*0]
   1981  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   1982  1.1  christos 	add	x1,x1,#8*4
   1983  1.1  christos 	adds	x19,x19,x10
   1984  1.1  christos 	adcs	x20,x20,x11
   1985  1.1  christos 	adcs	x21,x21,x12
   1986  1.1  christos 	adcs	x22,x22,x13
   1987  1.1  christos 	//adc	x0,x0,xzr
   1988  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   1989  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   1990  1.1  christos 	add	x3,x3,#8*4
   1991  1.1  christos 	b	.Loop_mul4x_tail
   1992  1.1  christos 
   1993  1.1  christos .align	4
   1994  1.1  christos .Loop_mul4x_break:
   1995  1.1  christos 	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
   1996  1.1  christos 	adds	x19,x19,x30
   1997  1.1  christos 	add	x2,x2,#8*4		// bp++
   1998  1.1  christos 	adcs	x20,x20,xzr
   1999  1.1  christos 	sub	x1,x1,x5		// rewind ap
   2000  1.1  christos 	adcs	x21,x21,xzr
   2001  1.1  christos 	stp	x19,x20,[x26,#8*0]	// result!!!
   2002  1.1  christos 	adcs	x22,x22,xzr
   2003  1.1  christos 	ldp	x19,x20,[sp,#8*4]	// t[0..3]
   2004  1.1  christos 	adc	x30,x0,xzr
   2005  1.1  christos 	stp	x21,x22,[x26,#8*2]	// result!!!
   2006  1.1  christos 	cmp	x2,x13			// done yet?
   2007  1.1  christos 	ldp	x21,x22,[sp,#8*6]
   2008  1.1  christos 	ldp	x14,x15,[x11,#8*0]	// n[0..3]
   2009  1.1  christos 	ldp	x16,x17,[x11,#8*2]
   2010  1.1  christos 	add	x3,x11,#8*4
   2011  1.1  christos 	b.eq	.Lmul4x_post
   2012  1.1  christos 
   2013  1.1  christos 	ldr	x24,[x2]
   2014  1.1  christos 	ldp	x6,x7,[x1,#8*0]	// a[0..3]
   2015  1.1  christos 	ldp	x8,x9,[x1,#8*2]
   2016  1.1  christos 	adds	x1,x1,#8*4		// clear carry bit
   2017  1.1  christos 	mov	x0,xzr
   2018  1.1  christos 	mov	x26,sp
   2019  1.1  christos 	b	.Loop_mul4x_reduction
   2020  1.1  christos 
   2021  1.1  christos .align	4
   2022  1.1  christos .Lmul4x_post:
   2023  1.1  christos 	// Final step. We see if result is larger than modulus, and
   2024  1.1  christos 	// if it is, subtract the modulus. But comparison implies
   2025  1.1  christos 	// subtraction. So we subtract modulus, see if it borrowed,
   2026  1.1  christos 	// and conditionally copy original value.
   2027  1.1  christos 	mov	x0,x12
   2028  1.1  christos 	mov	x27,x12		// x0 copy
   2029  1.1  christos 	subs	x10,x19,x14
   2030  1.1  christos 	add	x26,sp,#8*8
   2031  1.1  christos 	sbcs	x11,x20,x15
   2032  1.1  christos 	sub	x28,x5,#8*4
   2033  1.1  christos 
   2034  1.1  christos .Lmul4x_sub:
   2035  1.1  christos 	sbcs	x12,x21,x16
   2036  1.1  christos 	ldp	x14,x15,[x3,#8*0]
   2037  1.1  christos 	sub	x28,x28,#8*4
   2038  1.1  christos 	ldp	x19,x20,[x26,#8*0]
   2039  1.1  christos 	sbcs	x13,x22,x17
   2040  1.1  christos 	ldp	x16,x17,[x3,#8*2]
   2041  1.1  christos 	add	x3,x3,#8*4
   2042  1.1  christos 	ldp	x21,x22,[x26,#8*2]
   2043  1.1  christos 	add	x26,x26,#8*4
   2044  1.1  christos 	stp	x10,x11,[x0,#8*0]
   2045  1.1  christos 	sbcs	x10,x19,x14
   2046  1.1  christos 	stp	x12,x13,[x0,#8*2]
   2047  1.1  christos 	add	x0,x0,#8*4
   2048  1.1  christos 	sbcs	x11,x20,x15
   2049  1.1  christos 	cbnz	x28,.Lmul4x_sub
   2050  1.1  christos 
   2051  1.1  christos 	sbcs	x12,x21,x16
   2052  1.1  christos 	mov	x26,sp
   2053  1.1  christos 	add	x1,sp,#8*4
   2054  1.1  christos 	ldp	x6,x7,[x27,#8*0]
   2055  1.1  christos 	sbcs	x13,x22,x17
   2056  1.1  christos 	stp	x10,x11,[x0,#8*0]
   2057  1.1  christos 	ldp	x8,x9,[x27,#8*2]
   2058  1.1  christos 	stp	x12,x13,[x0,#8*2]
   2059  1.1  christos 	ldp	x19,x20,[x1,#8*0]
   2060  1.1  christos 	ldp	x21,x22,[x1,#8*2]
   2061  1.1  christos 	sbcs	xzr,x30,xzr	// did it borrow?
   2062  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   2063  1.1  christos 
   2064  1.1  christos 	sub	x28,x5,#8*4
   2065  1.1  christos .Lmul4x_cond_copy:
   2066  1.1  christos 	sub	x28,x28,#8*4
   2067  1.1  christos 	csel	x10,x19,x6,lo
   2068  1.1  christos 	stp	xzr,xzr,[x26,#8*0]
   2069  1.1  christos 	csel	x11,x20,x7,lo
   2070  1.1  christos 	ldp	x6,x7,[x27,#8*4]
   2071  1.1  christos 	ldp	x19,x20,[x1,#8*4]
   2072  1.1  christos 	csel	x12,x21,x8,lo
   2073  1.1  christos 	stp	xzr,xzr,[x26,#8*2]
   2074  1.1  christos 	add	x26,x26,#8*4
   2075  1.1  christos 	csel	x13,x22,x9,lo
   2076  1.1  christos 	ldp	x8,x9,[x27,#8*6]
   2077  1.1  christos 	ldp	x21,x22,[x1,#8*6]
   2078  1.1  christos 	add	x1,x1,#8*4
   2079  1.1  christos 	stp	x10,x11,[x27,#8*0]
   2080  1.1  christos 	stp	x12,x13,[x27,#8*2]
   2081  1.1  christos 	add	x27,x27,#8*4
   2082  1.1  christos 	cbnz	x28,.Lmul4x_cond_copy
   2083  1.1  christos 
   2084  1.1  christos 	csel	x10,x19,x6,lo
   2085  1.1  christos 	stp	xzr,xzr,[x26,#8*0]
   2086  1.1  christos 	csel	x11,x20,x7,lo
   2087  1.1  christos 	stp	xzr,xzr,[x26,#8*2]
   2088  1.1  christos 	csel	x12,x21,x8,lo
   2089  1.1  christos 	stp	xzr,xzr,[x26,#8*3]
   2090  1.1  christos 	csel	x13,x22,x9,lo
   2091  1.1  christos 	stp	xzr,xzr,[x26,#8*4]
   2092  1.1  christos 	stp	x10,x11,[x27,#8*0]
   2093  1.1  christos 	stp	x12,x13,[x27,#8*2]
   2094  1.1  christos 
   2095  1.1  christos 	b	.Lmul4x_done
   2096  1.1  christos 
   2097  1.1  christos .align	4
   2098  1.1  christos .Lmul4x4_post_condition:
   2099  1.1  christos 	adc	x0,x0,xzr
   2100  1.1  christos 	ldr	x1,[x29,#96]		// pull rp
   2101  1.1  christos 	// x19-x22,x0 hold result, x14-x17 hold modulus
   2102  1.1  christos 	subs	x6,x19,x14
   2103  1.1  christos 	ldr	x30,[x29,#8]		// pull return address
   2104  1.1  christos 	sbcs	x7,x20,x15
   2105  1.1  christos 	stp	xzr,xzr,[sp,#8*0]
   2106  1.1  christos 	sbcs	x8,x21,x16
   2107  1.1  christos 	stp	xzr,xzr,[sp,#8*2]
   2108  1.1  christos 	sbcs	x9,x22,x17
   2109  1.1  christos 	stp	xzr,xzr,[sp,#8*4]
   2110  1.1  christos 	sbcs	xzr,x0,xzr		// did it borrow?
   2111  1.1  christos 	stp	xzr,xzr,[sp,#8*6]
   2112  1.1  christos 
   2113  1.1  christos 	// x6-x9 hold result-modulus
   2114  1.1  christos 	csel	x6,x19,x6,lo
   2115  1.1  christos 	csel	x7,x20,x7,lo
   2116  1.1  christos 	csel	x8,x21,x8,lo
   2117  1.1  christos 	csel	x9,x22,x9,lo
   2118  1.1  christos 	stp	x6,x7,[x1,#8*0]
   2119  1.1  christos 	stp	x8,x9,[x1,#8*2]
   2120  1.1  christos 
   2121  1.1  christos .Lmul4x_done:
   2122  1.1  christos 	ldp	x19,x20,[x29,#16]
   2123  1.1  christos 	mov	sp,x29
   2124  1.1  christos 	ldp	x21,x22,[x29,#32]
   2125  1.1  christos 	mov	x0,#1
   2126  1.1  christos 	ldp	x23,x24,[x29,#48]
   2127  1.1  christos 	ldp	x25,x26,[x29,#64]
   2128  1.1  christos 	ldp	x27,x28,[x29,#80]
   2129  1.1  christos 	ldr	x29,[sp],#128
   2130  1.2  christos 	// x30 loaded earlier
   2131  1.2  christos 	AARCH64_VALIDATE_LINK_REGISTER
   2132  1.1  christos 	ret
   2133  1.1  christos .size	__bn_mul4x_mont,.-__bn_mul4x_mont
   2134  1.2  christos .section	.rodata	// read-only data: identification string for this module
   2135  1.1  christos .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// ASCII "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
   2136  1.1  christos .align	2	// NOTE(review): presumably a power-of-two exponent (4 bytes) as on AArch64 GAS - confirm
   2137  1.1  christos .align	4	// NOTE(review): presumably 16-byte alignment, which would subsume the directive above - confirm
   2138