Home | History | Annotate | Line # | Download | only in bt1
      1  1.1  mrg dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.
      2  1.1  mrg 
      3  1.1  mrg dnl  Contributed to the GNU project by Torbjörn Granlund.
      4  1.1  mrg 
      5  1.1  mrg dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
      6  1.1  mrg 
      7  1.1  mrg dnl  This file is part of the GNU MP Library.
      8  1.1  mrg dnl
      9  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1  mrg dnl  it under the terms of either:
     11  1.1  mrg dnl
     12  1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1  mrg dnl      option) any later version.
     15  1.1  mrg dnl
     16  1.1  mrg dnl  or
     17  1.1  mrg dnl
     18  1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1  mrg dnl      later version.
     21  1.1  mrg dnl
     22  1.1  mrg dnl  or both in parallel, as here.
     23  1.1  mrg dnl
     24  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1  mrg dnl  for more details.
     28  1.1  mrg dnl
     29  1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1  mrg dnl  see https://www.gnu.org/licenses/.
     32  1.1  mrg 
     33  1.1  mrg include(`../config.m4')
     34  1.1  mrg 
     35  1.1  mrg C	     cycles/limb
     36  1.1  mrg C AMD K8,K9	 ?
     37  1.1  mrg C AMD K10	 ?
     38  1.1  mrg C AMD bull	 ?
     39  1.1  mrg C AMD pile	 ?
     40  1.1  mrg C AMD steam	 ?
     41  1.1  mrg C AMD bobcat	 5.0
     42  1.1  mrg C AMD jaguar	 ?
     43  1.1  mrg C Intel P4	 ?
     44  1.1  mrg C Intel core	 ?
     45  1.1  mrg C Intel NHM	 ?
     46  1.1  mrg C Intel SBR	 ?
     47  1.1  mrg C Intel IBR	 ?
     48  1.1  mrg C Intel HWL	 ?
     49  1.1  mrg C Intel BWL	 ?
     50  1.1  mrg C Intel atom	 ?
     51  1.1  mrg C VIA nano	 ?
     52  1.1  mrg 
     53  1.1  mrg C TODO
     54  1.1  mrg C  * Micro-optimise, none performed thus far.
     55  1.1  mrg C  * Consider inlining mpn_add_n.
     56  1.1  mrg C  * Single basecases out before the pushes.
     57  1.1  mrg 
     58  1.1  mrg C When playing with pointers, set this to $2 to fall back to conservative
     59  1.1  mrg C indexing in wind-down code.
     60  1.1  mrg define(`I',`$1')		C $1 selects the plain-offset operand, $2 the i-indexed one
     61  1.1  mrg 
     62  1.1  mrg define(`rp',          `%rdi')   C DOS64: rcx.   Destination limbs
     63  1.1  mrg define(`up',          `%rsi')   C DOS64: rdx.   Input limbs, updated in place
     64  1.1  mrg define(`mp_param',    `%rdx')   C DOS64: r8.    Limbs multiplied by each q and added into up
     65  1.1  mrg define(`n',           `%rcx')   C DOS64: r9.    Limb count; negated after setup
     66  1.1  mrg define(`u0inv',       `%r8')    C DOS64: stack. Multiplier that produces each q limb
     67  1.1  mrg 
     68  1.1  mrg define(`i',           `%r14')	C inner loop index: negative, stepped by 4 up to 0
     69  1.1  mrg define(`j',           `%r15')	C outer loop counter: n down to 0
     70  1.1  mrg define(`mp',          `%r12')	C mp_param + 8n, so negative indices address the limbs
     71  1.1  mrg define(`q0',          `%r13')	C q limb of the current pass
     72  1.1  mrg define(`w0',          `%rbp')	C w0-w3: low/high words of the q0*mp products in flight
     73  1.1  mrg define(`w1',          `%r9')
     74  1.1  mrg define(`w2',          `%r10')
     75  1.1  mrg define(`w3',          `%r11')
     76  1.1  mrg 
     77  1.1  mrg C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
     78  1.1  mrg 
     79  1.1  mrg ABI_SUPPORT(DOS64)
     80  1.1  mrg ABI_SUPPORT(STD64)
     81  1.1  mrg 
     82  1.1  mrg define(`ALIGNx', `ALIGN(16)')	C alignment placed before the inner loop heads and L(n3)
     83  1.1  mrg C mpn_redc_1(rp, up, mp_param, n, u0inv): n passes of up += q*mp, then rp = upper n limbs of up + the per-pass carry limbs.
     84  1.1  mrg ASM_START()
     85  1.1  mrg 	TEXT
     86  1.1  mrg 	ALIGN(32)
     87  1.1  mrg PROLOGUE(mpn_redc_1)
     88  1.1  mrg 	FUNC_ENTRY(4)
     89  1.1  mrg IFDOS(`	mov	56(%rsp), %r8	')	C DOS64: u0inv is loaded from the stack
     90  1.1  mrg 	push	%rbx			C save the callee-saved registers used below
     91  1.1  mrg 	push	%rbp
     92  1.1  mrg 	push	%r12
     93  1.1  mrg 	push	%r13
     94  1.1  mrg 	push	%r14
     95  1.1  mrg 	push	%r15
     96  1.1  mrg 
     97  1.1  mrg 	mov	(up), q0		C q0 = up[0]
     98  1.1  mrg 	mov	n, j			C outer loop induction var
     99  1.1  mrg 	lea	(mp_param,n,8), mp	C mp = mp_param + 8n, just past the last limb
    100  1.1  mrg 	lea	(up,n,8), up		C up += 8n; with n negated, (up,n,8) is the lowest limb
    101  1.1  mrg 	neg	n			C n = -n: a negative limb index from here on
    102  1.1  mrg 	imul	u0inv, q0		C first iteration q0
    103  1.1  mrg 
    104  1.1  mrg 	test	$1, R8(n)		C dispatch on n mod 4 using the low bits of -n
    105  1.1  mrg 	jz	L(bx0)			C n even
    106  1.1  mrg 
    107  1.1  mrg L(bx1):	test	$2, R8(n)
    108  1.1  mrg 	jz	L(b3)			C n = 3 mod 4; falling through means n = 1 mod 4
    109  1.1  mrg C n = 1 mod 4.  Each L(otp1) pass adds q0*mp into the n-limb window at up, then slides the window up one limb.
    110  1.1  mrg L(b1):	cmp	$-1, R32(n)
    111  1.1  mrg 	jz	L(n1)			C n = 1 has its own straight-line code
    112  1.1  mrg 
    113  1.1  mrg L(otp1):lea	1(n), i			C i = n+1 with n negative: a multiple of 4
    114  1.1  mrg 	mov	(mp,n,8), %rax		C lowest mp limb
    115  1.1  mrg 	mul	q0			C rdx:rax = limb * q0
    116  1.1  mrg 	mov	%rax, w2
    117  1.1  mrg 	mov	%rdx, w3
    118  1.1  mrg 	mov	8(mp,n,8), %rax
    119  1.1  mrg 	mul	q0
    120  1.1  mrg 	mov	%rax, %rbx
    121  1.1  mrg 	mov	%rdx, w1
    122  1.1  mrg 	add	(up,n,8), w2		C lowest limb sum is not stored; only its carry is kept
    123  1.1  mrg 	adc	w3, %rbx
    124  1.1  mrg 	adc	$0, w1
    125  1.1  mrg 	mov	16(mp,n,8), %rax
    126  1.1  mrg 	mul	q0
    127  1.1  mrg 	mov	%rax, w2
    128  1.1  mrg 	mov	%rdx, w3
    129  1.1  mrg 	add	8(up,n,8), %rbx		C rbx = updated second limb of the window
    130  1.1  mrg 	mov	%rbx, 8(up,n,8)
    131  1.1  mrg 	adc	w1, w2
    132  1.1  mrg 	adc	$0, w3
    133  1.1  mrg 	imul	u0inv, %rbx		C next q limb
    134  1.1  mrg 	jmp	L(e1)			C join the unrolled loop part way through
    135  1.1  mrg 
    136  1.1  mrg 	ALIGNx
    137  1.1  mrg L(tp1):	add	w0, -16(up,i,8)		C inner loop: four mp limbs per iteration
    138  1.1  mrg 	adc	w1, w2
    139  1.1  mrg 	adc	$0, w3
    140  1.1  mrg 	mov	(mp,i,8), %rax
    141  1.1  mrg 	mul	q0
    142  1.1  mrg 	mov	%rax, w0
    143  1.1  mrg 	mov	%rdx, w1
    144  1.1  mrg 	add	w2, -8(up,i,8)
    145  1.1  mrg 	adc	w3, w0
    146  1.1  mrg 	adc	$0, w1
    147  1.1  mrg 	mov	8(mp,i,8), %rax
    148  1.1  mrg 	mul	q0
    149  1.1  mrg 	mov	%rax, w2
    150  1.1  mrg 	mov	%rdx, w3
    151  1.1  mrg 	add	w0, (up,i,8)
    152  1.1  mrg 	adc	w1, w2
    153  1.1  mrg 	adc	$0, w3
    154  1.1  mrg L(e1):	mov	16(mp,i,8), %rax	C entry point used by the pass setup above
    155  1.1  mrg 	mul	q0
    156  1.1  mrg 	mov	%rax, w0
    157  1.1  mrg 	mov	%rdx, w1
    158  1.1  mrg 	add	w2, 8(up,i,8)
    159  1.1  mrg 	adc	w3, w0
    160  1.1  mrg 	adc	$0, w1
    161  1.1  mrg 	mov	24(mp,i,8), %rax
    162  1.1  mrg 	mul	q0
    163  1.1  mrg 	mov	%rax, w2
    164  1.1  mrg 	mov	%rdx, w3
    165  1.1  mrg 	add	$4, i
    166  1.1  mrg 	js	L(tp1)			C repeat while i is still negative
    167  1.1  mrg 
    168  1.1  mrg L(ed1):	add	w0, I(-16(up),-16(up,i,8))	C wind-down: i is 0 here, so the plain offset suffices
    169  1.1  mrg 	adc	w1, w2
    170  1.1  mrg 	adc	$0, w3
    171  1.1  mrg 	add	w2, I(-8(up),-8(up,i,8))
    172  1.1  mrg 	adc	$0, w3
    173  1.1  mrg 	mov	w3, (up,n,8)		C up[0] = carry limb out of this pass
    174  1.1  mrg 	mov	%rbx, q0		C previously computed q limb -> q0
    175  1.1  mrg 	lea	8(up), up		C up++
    176  1.1  mrg 	dec	j
    177  1.1  mrg 	jnz	L(otp1)
    178  1.1  mrg 	jmp	L(cj)			C all n passes done: go add the two halves
    179  1.1  mrg C n = 3 mod 4.  Same pass structure as the other variants; only the loop entry point differs.
    180  1.1  mrg L(b3):	cmp	$-3, R32(n)
    181  1.1  mrg 	jz	L(n3)			C n = 3 has its own code
    182  1.1  mrg 
    183  1.1  mrg L(otp3):lea	3(n), i			C i = n+3 with n negative: a multiple of 4
    184  1.1  mrg 	mov	(mp,n,8), %rax		C lowest mp limb
    185  1.1  mrg 	mul	q0			C rdx:rax = limb * q0
    186  1.1  mrg 	mov	%rax, w2
    187  1.1  mrg 	mov	%rdx, w3
    188  1.1  mrg 	mov	8(mp,n,8), %rax
    189  1.1  mrg 	mul	q0
    190  1.1  mrg 	mov	%rax, %rbx
    191  1.1  mrg 	mov	%rdx, w1
    192  1.1  mrg 	add	(up,n,8), w2		C lowest limb sum is not stored; only its carry is kept
    193  1.1  mrg 	adc	w3, %rbx
    194  1.1  mrg 	adc	$0, w1
    195  1.1  mrg 	mov	16(mp,n,8), %rax
    196  1.1  mrg 	mul	q0
    197  1.1  mrg 	mov	%rax, w2
    198  1.1  mrg 	mov	%rdx, w3
    199  1.1  mrg 	add	8(up,n,8), %rbx		C rbx = updated second limb of the window
    200  1.1  mrg 	mov	%rbx, 8(up,n,8)
    201  1.1  mrg 	adc	w1, w2
    202  1.1  mrg 	adc	$0, w3
    203  1.1  mrg 	imul	u0inv, %rbx		C next q limb
    204  1.1  mrg 	jmp	L(e3)			C join the unrolled loop part way through
    205  1.1  mrg 
    206  1.1  mrg 	ALIGNx
    207  1.1  mrg L(tp3):	add	w0, -16(up,i,8)		C inner loop: four mp limbs per iteration
    208  1.1  mrg 	adc	w1, w2
    209  1.1  mrg 	adc	$0, w3
    210  1.1  mrg L(e3):	mov	(mp,i,8), %rax		C entry point used by the pass setup above
    211  1.1  mrg 	mul	q0
    212  1.1  mrg 	mov	%rax, w0
    213  1.1  mrg 	mov	%rdx, w1
    214  1.1  mrg 	add	w2, -8(up,i,8)
    215  1.1  mrg 	adc	w3, w0
    216  1.1  mrg 	adc	$0, w1
    217  1.1  mrg 	mov	8(mp,i,8), %rax
    218  1.1  mrg 	mul	q0
    219  1.1  mrg 	mov	%rax, w2
    220  1.1  mrg 	mov	%rdx, w3
    221  1.1  mrg 	add	w0, (up,i,8)
    222  1.1  mrg 	adc	w1, w2
    223  1.1  mrg 	adc	$0, w3
    224  1.1  mrg 	mov	16(mp,i,8), %rax
    225  1.1  mrg 	mul	q0
    226  1.1  mrg 	mov	%rax, w0
    227  1.1  mrg 	mov	%rdx, w1
    228  1.1  mrg 	add	w2, 8(up,i,8)
    229  1.1  mrg 	adc	w3, w0
    230  1.1  mrg 	adc	$0, w1
    231  1.1  mrg 	mov	24(mp,i,8), %rax
    232  1.1  mrg 	mul	q0
    233  1.1  mrg 	mov	%rax, w2
    234  1.1  mrg 	mov	%rdx, w3
    235  1.1  mrg 	add	$4, i
    236  1.1  mrg 	js	L(tp3)			C repeat while i is still negative
    237  1.1  mrg 
    238  1.1  mrg L(ed3):	add	w0, I(-16(up),-16(up,i,8))	C wind-down: i is 0 here, so the plain offset suffices
    239  1.1  mrg 	adc	w1, w2
    240  1.1  mrg 	adc	$0, w3
    241  1.1  mrg 	add	w2, I(-8(up),-8(up,i,8))
    242  1.1  mrg 	adc	$0, w3
    243  1.1  mrg 	mov	w3, (up,n,8)		C up[0] = carry limb out of this pass
    244  1.1  mrg 	mov	%rbx, q0		C previously computed q limb -> q0
    245  1.1  mrg 	lea	8(up), up		C up++
    246  1.1  mrg 	dec	j
    247  1.1  mrg 	jnz	L(otp3)
    248  1.1  mrg C	jmp	L(cj)			not needed: L(cj) follows directly
    249  1.1  mrg C Tail for n > 3: mpn_add_n(rp, up, up - n, n) adds the n saved carry limbs to the upper n limbs; rax is not written again before ret (FUNC_EXIT aside).
    250  1.1  mrg L(cj):
    251  1.1  mrg IFSTD(`	lea	(up,n,8), up		C param 2: up
    252  1.1  mrg 	lea	(up,n,8), %rdx		C param 3: up - n
    253  1.1  mrg 	neg	R32(n)		')	C param 4: n
    254  1.1  mrg 
    255  1.1  mrg IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
    256  1.1  mrg 	lea	(%rdx,n,8), %r8		C param 3: up - n
    257  1.1  mrg 	neg	R32(n)			C 32-bit negate: upper half of the register becomes zero
    258  1.1  mrg 	mov	n, %r9			C param 4: n
    259  1.1  mrg 	mov	rp, %rcx	')	C param 1: rp
    260  1.1  mrg 
    261  1.1  mrg IFSTD(`	sub	$8, %rsp	')	C presumably pads rsp to 16-byte alignment for the call
    262  1.1  mrg IFDOS(`	sub	$40, %rsp	')	C presumably 32 bytes of shadow space plus 8 of padding
    263  1.1  mrg 	ASSERT(nz, `test $15, %rsp')
    264  1.1  mrg 	CALL(	mpn_add_n)
    265  1.1  mrg IFSTD(`	add	$8, %rsp	')
    266  1.1  mrg IFDOS(`	add	$40, %rsp	')
    267  1.1  mrg 
    268  1.1  mrg L(ret):	pop	%r15			C restore in reverse order of the pushes
    269  1.1  mrg 	pop	%r14
    270  1.1  mrg 	pop	%r13
    271  1.1  mrg 	pop	%r12
    272  1.1  mrg 	pop	%rbp
    273  1.1  mrg 	pop	%rbx
    274  1.1  mrg 	FUNC_EXIT()
    275  1.1  mrg 	ret
    276  1.1  mrg C n even: bit 1 of -n separates n = 2 mod 4 from n = 0 mod 4.
    277  1.1  mrg L(bx0):	test	$2, R8(n)
    278  1.1  mrg 	jnz	L(b2)			C n = 2 mod 4
    279  1.1  mrg C n = 0 mod 4.  This path has no small-n special case.
    280  1.1  mrg L(b0):
    281  1.1  mrg L(otp0):lea	(n), i			C i = n, already a multiple of 4
    282  1.1  mrg 	mov	(mp,n,8), %rax		C lowest mp limb
    283  1.1  mrg 	mul	q0			C rdx:rax = limb * q0
    284  1.1  mrg 	mov	%rax, w0
    285  1.1  mrg 	mov	%rdx, w1
    286  1.1  mrg 	mov	8(mp,n,8), %rax
    287  1.1  mrg 	mul	q0
    288  1.1  mrg 	mov	%rax, %rbx
    289  1.1  mrg 	mov	%rdx, w3
    290  1.1  mrg 	add	(up,n,8), w0		C lowest limb sum is not stored; only its carry is kept
    291  1.1  mrg 	adc	w1, %rbx
    292  1.1  mrg 	adc	$0, w3
    293  1.1  mrg 	mov	16(mp,n,8), %rax
    294  1.1  mrg 	mul	q0
    295  1.1  mrg 	mov	%rax, w0
    296  1.1  mrg 	mov	%rdx, w1
    297  1.1  mrg 	add	8(up,n,8), %rbx		C rbx = updated second limb of the window
    298  1.1  mrg 	mov	%rbx, 8(up,n,8)
    299  1.1  mrg 	adc	w3, w0
    300  1.1  mrg 	adc	$0, w1
    301  1.1  mrg 	imul	u0inv, %rbx		C next q limb
    302  1.1  mrg 	jmp	L(e0)			C join the unrolled loop part way through
    303  1.1  mrg 
    304  1.1  mrg 	ALIGNx
    305  1.1  mrg L(tp0):	add	w0, -16(up,i,8)		C inner loop: four mp limbs per iteration
    306  1.1  mrg 	adc	w1, w2
    307  1.1  mrg 	adc	$0, w3
    308  1.1  mrg 	mov	(mp,i,8), %rax
    309  1.1  mrg 	mul	q0
    310  1.1  mrg 	mov	%rax, w0
    311  1.1  mrg 	mov	%rdx, w1
    312  1.1  mrg 	add	w2, -8(up,i,8)
    313  1.1  mrg 	adc	w3, w0
    314  1.1  mrg 	adc	$0, w1
    315  1.1  mrg 	mov	8(mp,i,8), %rax
    316  1.1  mrg 	mul	q0
    317  1.1  mrg 	mov	%rax, w2
    318  1.1  mrg 	mov	%rdx, w3
    319  1.1  mrg 	add	w0, (up,i,8)
    320  1.1  mrg 	adc	w1, w2
    321  1.1  mrg 	adc	$0, w3
    322  1.1  mrg 	mov	16(mp,i,8), %rax
    323  1.1  mrg 	mul	q0
    324  1.1  mrg 	mov	%rax, w0
    325  1.1  mrg 	mov	%rdx, w1
    326  1.1  mrg 	add	w2, 8(up,i,8)
    327  1.1  mrg 	adc	w3, w0
    328  1.1  mrg 	adc	$0, w1
    329  1.1  mrg L(e0):	mov	24(mp,i,8), %rax	C entry point used by the pass setup above
    330  1.1  mrg 	mul	q0
    331  1.1  mrg 	mov	%rax, w2
    332  1.1  mrg 	mov	%rdx, w3
    333  1.1  mrg 	add	$4, i
    334  1.1  mrg 	js	L(tp0)			C repeat while i is still negative
    335  1.1  mrg 
    336  1.1  mrg L(ed0):	add	w0, I(-16(up),-16(up,i,8))	C wind-down: i is 0 here, so the plain offset suffices
    337  1.1  mrg 	adc	w1, w2
    338  1.1  mrg 	adc	$0, w3
    339  1.1  mrg 	add	w2, I(-8(up),-8(up,i,8))
    340  1.1  mrg 	adc	$0, w3
    341  1.1  mrg 	mov	w3, (up,n,8)		C up[0] = carry limb out of this pass
    342  1.1  mrg 	mov	%rbx, q0		C previously computed q limb -> q0
    343  1.1  mrg 	lea	8(up), up		C up++
    344  1.1  mrg 	dec	j
    345  1.1  mrg 	jnz	L(otp0)
    346  1.1  mrg 	jmp	L(cj)			C all n passes done: go add the two halves
    347  1.1  mrg C n = 2 mod 4.  Same pass structure as the other variants; only the loop entry point differs.
    348  1.1  mrg L(b2):	cmp	$-2, R32(n)
    349  1.1  mrg 	jz	L(n2)			C n = 2 has its own straight-line code
    350  1.1  mrg 
    351  1.1  mrg L(otp2):lea	2(n), i			C i = n+2 with n negative: a multiple of 4
    352  1.1  mrg 	mov	(mp,n,8), %rax		C lowest mp limb
    353  1.1  mrg 	mul	q0			C rdx:rax = limb * q0
    354  1.1  mrg 	mov	%rax, w0
    355  1.1  mrg 	mov	%rdx, w1
    356  1.1  mrg 	mov	8(mp,n,8), %rax
    357  1.1  mrg 	mul	q0
    358  1.1  mrg 	mov	%rax, %rbx
    359  1.1  mrg 	mov	%rdx, w3
    360  1.1  mrg 	add	(up,n,8), w0		C lowest limb sum is not stored; only its carry is kept
    361  1.1  mrg 	adc	w1, %rbx
    362  1.1  mrg 	adc	$0, w3
    363  1.1  mrg 	mov	16(mp,n,8), %rax
    364  1.1  mrg 	mul	q0
    365  1.1  mrg 	mov	%rax, w0
    366  1.1  mrg 	mov	%rdx, w1
    367  1.1  mrg 	add	8(up,n,8), %rbx		C rbx = updated second limb of the window
    368  1.1  mrg 	mov	%rbx, 8(up,n,8)
    369  1.1  mrg 	adc	w3, w0
    370  1.1  mrg 	adc	$0, w1
    371  1.1  mrg 	imul	u0inv, %rbx		C next q limb
    372  1.1  mrg 	jmp	L(e2)			C join the unrolled loop part way through
    373  1.1  mrg 
    374  1.1  mrg 	ALIGNx
    375  1.1  mrg L(tp2):	add	w0, -16(up,i,8)		C inner loop: four mp limbs per iteration
    376  1.1  mrg 	adc	w1, w2
    377  1.1  mrg 	adc	$0, w3
    378  1.1  mrg 	mov	(mp,i,8), %rax
    379  1.1  mrg 	mul	q0
    380  1.1  mrg 	mov	%rax, w0
    381  1.1  mrg 	mov	%rdx, w1
    382  1.1  mrg 	add	w2, -8(up,i,8)
    383  1.1  mrg 	adc	w3, w0
    384  1.1  mrg 	adc	$0, w1
    385  1.1  mrg L(e2):	mov	8(mp,i,8), %rax		C entry point used by the pass setup above
    386  1.1  mrg 	mul	q0
    387  1.1  mrg 	mov	%rax, w2
    388  1.1  mrg 	mov	%rdx, w3
    389  1.1  mrg 	add	w0, (up,i,8)
    390  1.1  mrg 	adc	w1, w2
    391  1.1  mrg 	adc	$0, w3
    392  1.1  mrg 	mov	16(mp,i,8), %rax
    393  1.1  mrg 	mul	q0
    394  1.1  mrg 	mov	%rax, w0
    395  1.1  mrg 	mov	%rdx, w1
    396  1.1  mrg 	add	w2, 8(up,i,8)
    397  1.1  mrg 	adc	w3, w0
    398  1.1  mrg 	adc	$0, w1
    399  1.1  mrg 	mov	24(mp,i,8), %rax
    400  1.1  mrg 	mul	q0
    401  1.1  mrg 	mov	%rax, w2
    402  1.1  mrg 	mov	%rdx, w3
    403  1.1  mrg 	add	$4, i
    404  1.1  mrg 	js	L(tp2)			C repeat while i is still negative
    405  1.1  mrg 
    406  1.1  mrg L(ed2):	add	w0, I(-16(up),-16(up,i,8))	C wind-down: i is 0 here, so the plain offset suffices
    407  1.1  mrg 	adc	w1, w2
    408  1.1  mrg 	adc	$0, w3
    409  1.1  mrg 	add	w2, I(-8(up),-8(up,i,8))
    410  1.1  mrg 	adc	$0, w3
    411  1.1  mrg 	mov	w3, (up,n,8)		C up[0] = carry limb out of this pass
    412  1.1  mrg 	mov	%rbx, q0		C previously computed q limb -> q0
    413  1.1  mrg 	lea	8(up), up		C up++
    414  1.1  mrg 	dec	j
    415  1.1  mrg 	jnz	L(otp2)
    416  1.1  mrg 	jmp	L(cj)			C all n passes done: go add the two halves
    417  1.1  mrg C n = 1: rp[0] = up[1] + high word of q0*mp[0] + carry from the low word; rax = carry out.
    418  1.1  mrg L(n1):	mov	(mp_param), %rax	C mp[0]
    419  1.1  mrg 	mul	q0
    420  1.1  mrg 	add	-8(up), %rax		C up[0] + low word: the sum is unused, only CF matters
    421  1.1  mrg 	adc	(up), %rdx		C up[1] + high word + carry
    422  1.1  mrg 	mov	%rdx, (rp)
    423  1.1  mrg 	mov	$0, R32(%rax)		C mov rather than xor, so CF from the adc survives
    424  1.1  mrg 	adc	R32(%rax), R32(%rax)	C rax = carry out
    425  1.1  mrg 	jmp	L(ret)
    426  1.1  mrg C n = 2: both passes written out straight-line and the final add inlined; rax = carry out.
    427  1.1  mrg L(n2):	mov	(mp_param), %rax	C mp[0]
    428  1.1  mrg 	mov	-16(up), %rbp		C up[0]
    429  1.1  mrg 	mul	q0
    430  1.1  mrg 	add	%rax, %rbp		C low limb sum, used only for its carry
    431  1.1  mrg 	mov	%rdx, %r9
    432  1.1  mrg 	adc	$0, %r9
    433  1.1  mrg 	mov	-8(mp), %rax		C mp[1]
    434  1.1  mrg 	mov	-8(up), %r10		C up[1]
    435  1.1  mrg 	mul	q0
    436  1.1  mrg 	add	%rax, %r10
    437  1.1  mrg 	mov	%rdx, %r11
    438  1.1  mrg 	adc	$0, %r11
    439  1.1  mrg 	add	%r9, %r10
    440  1.1  mrg 	adc	$0, %r11		C r11 = carry limb of the first pass
    441  1.1  mrg 	mov	%r10, q0
    442  1.1  mrg 	imul	u0inv, q0		C next q0
    443  1.1  mrg 	mov	-16(mp), %rax		C second pass: mp[0]
    444  1.1  mrg 	mul	q0
    445  1.1  mrg 	add	%rax, %r10		C again only the carry is used
    446  1.1  mrg 	mov	%rdx, %r9
    447  1.1  mrg 	adc	$0, %r9
    448  1.1  mrg 	mov	-8(mp), %rax		C mp[1]
    449  1.1  mrg 	mov	(up), %r14		C up[2]
    450  1.1  mrg 	mul	q0
    451  1.1  mrg 	add	%rax, %r14
    452  1.1  mrg 	adc	$0, %rdx
    453  1.1  mrg 	add	%r9, %r14
    454  1.1  mrg 	adc	$0, %rdx		C rdx = carry limb of the second pass
    455  1.1  mrg 	xor	R32(%rax), R32(%rax)	C clear rax before the flag-setting adds below
    456  1.1  mrg 	add	%r11, %r14		C rp[0] = updated up[2] + first carry limb
    457  1.1  mrg 	adc	8(up), %rdx		C rp[1] = up[3] + second carry limb + carry
    458  1.1  mrg 	mov	%r14, (rp)
    459  1.1  mrg 	mov	%rdx, 8(rp)
    460  1.1  mrg 	adc	R32(%rax), R32(%rax)	C rax = carry out
    461  1.1  mrg 	jmp	L(ret)
    462  1.1  mrg C n = 3: each pass is one trip round L(n3), three trips in all; the final 3-limb add is inlined; rax = carry out.
    463  1.1  mrg 	ALIGNx
    464  1.1  mrg L(n3):	mov	-24(mp), %rax		C mp[0]
    465  1.1  mrg 	mov	-24(up), %r10		C lowest limb of the current window
    466  1.1  mrg 	mul	q0
    467  1.1  mrg 	add	%rax, %r10		C only the carry of this sum is used
    468  1.1  mrg 	mov	-16(mp), %rax		C mp[1]
    469  1.1  mrg 	mov	%rdx, %r11
    470  1.1  mrg 	adc	$0, %r11
    471  1.1  mrg 	mov	-16(up), %rbp
    472  1.1  mrg 	mul	q0
    473  1.1  mrg 	add	%rax, %rbp
    474  1.1  mrg 	mov	%rdx, %r9
    475  1.1  mrg 	adc	$0, %r9
    476  1.1  mrg 	mov	-8(mp), %rax		C mp[2]
    477  1.1  mrg 	add	%r11, %rbp
    478  1.1  mrg 	mov	-8(up), %r10
    479  1.1  mrg 	adc	$0, %r9
    480  1.1  mrg 	mul	q0			C last use of the current q0
    481  1.1  mrg 	mov	%rbp, q0
    482  1.1  mrg 	imul	u0inv, q0		C next q0
    483  1.1  mrg 	add	%rax, %r10
    484  1.1  mrg 	mov	%rdx, %r11
    485  1.1  mrg 	adc	$0, %r11
    486  1.1  mrg 	mov	%rbp, -16(up)
    487  1.1  mrg 	add	%r9, %r10
    488  1.1  mrg 	adc	$0, %r11
    489  1.1  mrg 	mov	%r10, -8(up)
    490  1.1  mrg 	mov	%r11, -24(up)		C up[0] = carry limb out of this pass
    491  1.1  mrg 	lea	8(up), up		C up++
    492  1.1  mrg 	dec	j
    493  1.1  mrg 	jnz	L(n3)
    494  1.1  mrg 
    495  1.1  mrg 	mov	-48(up), %rdx		C carry limb saved by the first pass
    496  1.1  mrg 	mov	-40(up), %rbx		C carry limb saved by the second pass
    497  1.1  mrg 	xor	R32(%rax), R32(%rax)	C clear rax before the flag-setting adds below
    498  1.1  mrg 	add	%rbp, %rdx		C rbp, r10, r11 still hold the results of the last pass
    499  1.1  mrg 	adc	%r10, %rbx
    500  1.1  mrg 	adc	-8(up), %r11		C third carry limb + the sixth input limb
    501  1.1  mrg 	mov	%rdx, (rp)
    502  1.1  mrg 	mov	%rbx, 8(rp)
    503  1.1  mrg 	mov	%r11, 16(rp)
    504  1.1  mrg 	adc	R32(%rax), R32(%rax)	C rax = carry out
    505  1.1  mrg 	jmp	L(ret)
    506  1.1  mrg EPILOGUE()
    507  1.1  mrg ASM_END()
    508