Home | History | Annotate | Line # | Download | only in sse2
      1      1.1  mrg dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
      2      1.1  mrg 
      3      1.1  mrg dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  Copyright 2011 Free Software Foundation, Inc.
      6  1.1.1.2  mrg 
      7      1.1  mrg dnl  This file is part of the GNU MP Library.
      8      1.1  mrg dnl
      9      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1.1.2  mrg dnl  it under the terms of either:
     11  1.1.1.2  mrg dnl
     12  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1.1.2  mrg dnl      option) any later version.
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl  or
     17  1.1.1.2  mrg dnl
     18  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1.1.2  mrg dnl      later version.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  or both in parallel, as here.
     23      1.1  mrg dnl
     24      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1.1.2  mrg dnl  for more details.
     28      1.1  mrg dnl
     29  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     32      1.1  mrg 
     33      1.1  mrg include(`../config.m4')
     34      1.1  mrg 
     35      1.1  mrg C TODO
     36      1.1  mrg C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
     37      1.1  mrg C    4 large loops into one; we could use it for the outer loop branch.
     38      1.1  mrg C  * Optimise code outside of inner loops.
     39      1.1  mrg C  * Write combined addmul_1 feed-in and wind-down code, and use when iterating
     40      1.1  mrg C    each outer loop.  ("Overlapping software pipelining")
     41      1.1  mrg C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
     42      1.1  mrg C    all pushes.
     43      1.1  mrg C  * Perhaps write special code for n < M, for some small M.
     44      1.1  mrg C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
     45      1.1  mrg C    with even less pipelined code.
     46      1.1  mrg C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
     47      1.1  mrg C    Consider breaking out earlier, saving the high cost of short loops.
     48      1.1  mrg 
     49      1.1  mrg C void mpn_sqr_basecase (mp_ptr wp,
     50      1.1  mrg C                        mp_srcptr xp, mp_size_t xn);
     51      1.1  mrg 
     52      1.1  mrg define(`rp',  `%edi')	C destination pointer, loaded from the first stack argument
     53      1.1  mrg define(`up',  `%esi')	C source pointer, loaded from the second stack argument
     54      1.1  mrg define(`n',   `%ecx')	C limb count from the third argument, later kept negated
     55      1.1  mrg 
     56      1.1  mrg define(`un',  `%ebp')	C inner loop counter, negative and counting up to zero
     57      1.1  mrg 
     58      1.1  mrg 	TEXT
     59      1.1  mrg 	ALIGN(16)
     60      1.1  mrg PROLOGUE(mpn_sqr_basecase)
     61      1.1  mrg 	push	%edi	C entry: square the xn-limb number at xp into wp; edi and esi are restored on exit
     62      1.1  mrg 	push	%esi
     63      1.1  mrg 	mov	12(%esp), rp	C first argument (wp); two pushes plus return address precede it
     64      1.1  mrg 	mov	16(%esp), up	C second argument (xp)
     65      1.1  mrg 	mov	20(%esp), n	C third argument (xn)
     66      1.1  mrg 
     67      1.1  mrg 	lea	4(rp), rp	C write triangular product starting at rp[1]
     68      1.1  mrg 	dec	n	C n = xn-1; ZF is set when xn is 1
     69      1.1  mrg 	movd	(up), %mm7	C mm7 = up[0], the multiplier for the first row
     70      1.1  mrg 
     71      1.1  mrg 	jz	L(one)	C ZF still from dec (movd keeps flags): single-limb case
     72      1.1  mrg 	lea	4(up), up	C up now addresses the second source limb
     73      1.1  mrg 	push	%ebx	C ebx and ebp are used below and popped again at L(rtn)
     74      1.1  mrg 	push	%ebp
     75      1.1  mrg 	mov	n, %eax	C eax = xn-1, used only for the dispatch below
     76      1.1  mrg 
     77      1.1  mrg 	movd	(up), %mm0
     78      1.1  mrg 	neg	n	C n = -(xn-1); from here on n is a negative count
     79      1.1  mrg 	pmuludq	%mm7, %mm0	C first product, consumed at whichever loop entry is taken
     80      1.1  mrg 	pxor	%mm6, %mm6	C mm6 = 0; it carries the running sum through the mul_1 loop
     81      1.1  mrg 	mov	n, un	C un = negative limb count for the first row
     82      1.1  mrg 
     83      1.1  mrg 	and	$3, %eax	C select the loop entry from (xn-1) mod 4
     84      1.1  mrg 	jz	L(of0)	C remainder 0
     85      1.1  mrg 	cmp	$2, %eax
     86      1.1  mrg 	jc	L(of1)	C remainder 1
     87      1.1  mrg 	jz	L(of2)	C remainder 2; remainder 3 falls through to the jmp L(m3)
     88      1.1  mrg 
     89      1.1  mrg C ================================================================
     90      1.1  mrg 	jmp	L(m3)	C (xn-1) mod 4 = 3: enter the first-row multiply loop mid-body
     91      1.1  mrg 	ALIGN(16)
     92      1.1  mrg L(lm3):	movd	-4(up), %mm0
     93      1.1  mrg 	pmuludq	%mm7, %mm0
     94      1.1  mrg 	psrlq	$32, %mm6	C discard the limb already stored, keep the carry
     95      1.1  mrg 	lea	16(rp), rp
     96      1.1  mrg 	paddq	%mm0, %mm6	C mm6 = carry + 64-bit product
     97      1.1  mrg 	movd	(up), %mm0
     98      1.1  mrg 	pmuludq	%mm7, %mm0
     99      1.1  mrg 	movd	%mm6, -4(rp)	C low 32 bits of mm6 are the next result limb
    100      1.1  mrg 	psrlq	$32, %mm6
    101      1.1  mrg L(m3):	paddq	%mm0, %mm6	C entry point: mm0 holds the product computed before the dispatch
    102      1.1  mrg 	movd	4(up), %mm0
    103      1.1  mrg 	pmuludq	%mm7, %mm0
    104      1.1  mrg 	movd	%mm6, (rp)
    105      1.1  mrg 	psrlq	$32, %mm6
    106      1.1  mrg 	paddq	%mm0, %mm6
    107      1.1  mrg 	movd	8(up), %mm0
    108      1.1  mrg 	pmuludq	%mm7, %mm0
    109      1.1  mrg 	movd	%mm6, 4(rp)
    110      1.1  mrg 	psrlq	$32, %mm6
    111      1.1  mrg 	paddq	%mm0, %mm6
    112      1.1  mrg 	add	$4, un	C un is negative and advances by four limbs per pass
    113      1.1  mrg 	movd	%mm6, 8(rp)
    114      1.1  mrg 	lea	16(up), up
    115      1.1  mrg 	js	L(lm3)	C sign flag still from the add (movd and lea keep flags)
    116      1.1  mrg 
    117      1.1  mrg 	psrlq	$32, %mm6
    118      1.1  mrg 	movd	%mm6, 12(rp)	C the final carry is stored as the top limb of this row
    119      1.1  mrg 
    120      1.1  mrg 	inc	n	C n is negative, so this shortens the count by one limb
    121      1.1  mrg C	jz	L(done)
    122      1.1  mrg   lea	-12(up), up	C rebase up and rp to what the L(ol2) entry expects
    123      1.1  mrg   lea	4(rp), rp
    124      1.1  mrg 	jmp	L(ol2)	C continue with the multiply-accumulate rows
    125      1.1  mrg 
    126      1.1  mrg C ================================================================
    127      1.1  mrg 	ALIGN(16)
    128      1.1  mrg L(lm0):	movd	(up), %mm0
    129      1.1  mrg 	pmuludq	%mm7, %mm0
    130      1.1  mrg 	psrlq	$32, %mm6	C discard the limb already stored, keep the carry
    131      1.1  mrg 	lea	16(rp), rp
    132      1.1  mrg L(of0):	paddq	%mm0, %mm6	C entry for (xn-1) mod 4 = 0; mm0 holds the first product
    133      1.1  mrg 	movd	4(up), %mm0
    134      1.1  mrg 	pmuludq	%mm7, %mm0
    135      1.1  mrg 	movd	%mm6, (rp)	C low 32 bits of mm6 are the next result limb
    136      1.1  mrg 	psrlq	$32, %mm6
    137      1.1  mrg 	paddq	%mm0, %mm6	C mm6 = carry + 64-bit product
    138      1.1  mrg 	movd	8(up), %mm0
    139      1.1  mrg 	pmuludq	%mm7, %mm0
    140      1.1  mrg 	movd	%mm6, 4(rp)
    141      1.1  mrg 	psrlq	$32, %mm6
    142      1.1  mrg 	paddq	%mm0, %mm6
    143      1.1  mrg 	movd	12(up), %mm0
    144      1.1  mrg 	pmuludq	%mm7, %mm0
    145      1.1  mrg 	movd	%mm6, 8(rp)
    146      1.1  mrg 	psrlq	$32, %mm6
    147      1.1  mrg 	paddq	%mm0, %mm6
    148      1.1  mrg 	add	$4, un	C un is negative and advances by four limbs per pass
    149      1.1  mrg 	movd	%mm6, 12(rp)
    150      1.1  mrg 	lea	16(up), up
    151      1.1  mrg 	js	L(lm0)	C sign flag still from the add (movd and lea keep flags)
    152      1.1  mrg 
    153      1.1  mrg 	psrlq	$32, %mm6
    154      1.1  mrg 	movd	%mm6, 16(rp)	C the final carry is stored as the top limb of this row
    155      1.1  mrg 
    156      1.1  mrg 	inc	n	C n is negative, so this shortens the count by one limb
    157      1.1  mrg C	jz	L(done)
    158      1.1  mrg   lea	-8(up), up	C rebase up and rp to what the L(ol3) entry expects
    159      1.1  mrg   lea	8(rp), rp
    160      1.1  mrg 	jmp	L(ol3)	C continue with the multiply-accumulate rows
    161      1.1  mrg 
    162      1.1  mrg C ================================================================
    163      1.1  mrg 	ALIGN(16)
    164      1.1  mrg L(lm1):	movd	-12(up), %mm0
    165      1.1  mrg 	pmuludq	%mm7, %mm0
    166      1.1  mrg 	psrlq	$32, %mm6	C discard the limb already stored, keep the carry
    167      1.1  mrg 	lea	16(rp), rp
    168      1.1  mrg 	paddq	%mm0, %mm6	C mm6 = carry + 64-bit product
    169      1.1  mrg 	movd	-8(up), %mm0
    170      1.1  mrg 	pmuludq	%mm7, %mm0
    171      1.1  mrg 	movd	%mm6, -12(rp)	C low 32 bits of mm6 are the next result limb
    172      1.1  mrg 	psrlq	$32, %mm6
    173      1.1  mrg 	paddq	%mm0, %mm6
    174      1.1  mrg 	movd	-4(up), %mm0
    175      1.1  mrg 	pmuludq	%mm7, %mm0
    176      1.1  mrg 	movd	%mm6, -8(rp)
    177      1.1  mrg 	psrlq	$32, %mm6
    178      1.1  mrg 	paddq	%mm0, %mm6
    179      1.1  mrg 	movd	(up), %mm0
    180      1.1  mrg 	pmuludq	%mm7, %mm0
    181      1.1  mrg 	movd	%mm6, -4(rp)
    182      1.1  mrg 	psrlq	$32, %mm6
    183      1.1  mrg L(of1):	paddq	%mm0, %mm6	C entry for (xn-1) mod 4 = 1; mm0 holds the first product
    184      1.1  mrg 	add	$4, un	C un is negative and advances by four limbs per pass
    185      1.1  mrg 	movd	%mm6, (rp)
    186      1.1  mrg 	lea	16(up), up
    187      1.1  mrg 	js	L(lm1)	C sign flag still from the add (movd and lea keep flags)
    188      1.1  mrg 
    189      1.1  mrg 	psrlq	$32, %mm6
    190      1.1  mrg 	movd	%mm6, 4(rp)	C the final carry is stored as the top limb of this row
    191      1.1  mrg 
    192      1.1  mrg 	inc	n	C n is negative; it reaches zero here exactly when xn is 2
    193      1.1  mrg 	jz	L(done)		C goes away when we add special n=2 code
    194      1.1  mrg   lea	-20(up), up	C rebase up and rp to what the L(ol0) entry expects
    195      1.1  mrg   lea	-4(rp), rp
    196      1.1  mrg 	jmp	L(ol0)	C continue with the multiply-accumulate rows
    197      1.1  mrg 
    198      1.1  mrg C ================================================================
    199      1.1  mrg 	ALIGN(16)
    200      1.1  mrg L(lm2):	movd	-8(up), %mm0
    201      1.1  mrg 	pmuludq	%mm7, %mm0
    202      1.1  mrg 	psrlq	$32, %mm6	C discard the limb already stored, keep the carry
    203      1.1  mrg 	lea	16(rp), rp
    204      1.1  mrg 	paddq	%mm0, %mm6	C mm6 = carry + 64-bit product
    205      1.1  mrg 	movd	-4(up), %mm0
    206      1.1  mrg 	pmuludq	%mm7, %mm0
    207      1.1  mrg 	movd	%mm6, -8(rp)	C low 32 bits of mm6 are the next result limb
    208      1.1  mrg 	psrlq	$32, %mm6
    209      1.1  mrg 	paddq	%mm0, %mm6
    210      1.1  mrg 	movd	(up), %mm0
    211      1.1  mrg 	pmuludq	%mm7, %mm0
    212      1.1  mrg 	movd	%mm6, -4(rp)
    213      1.1  mrg 	psrlq	$32, %mm6
    214      1.1  mrg L(of2):	paddq	%mm0, %mm6	C entry for (xn-1) mod 4 = 2; mm0 holds the first product
    215      1.1  mrg 	movd	4(up), %mm0
    216      1.1  mrg 	pmuludq	%mm7, %mm0
    217      1.1  mrg 	movd	%mm6, (rp)
    218      1.1  mrg 	psrlq	$32, %mm6
    219      1.1  mrg 	paddq	%mm0, %mm6
    220      1.1  mrg 	add	$4, un	C un is negative and advances by four limbs per pass
    221      1.1  mrg 	movd	%mm6, 4(rp)
    222      1.1  mrg 	lea	16(up), up
    223      1.1  mrg 	js	L(lm2)	C sign flag still from the add (movd and lea keep flags)
    224      1.1  mrg 
    225      1.1  mrg 	psrlq	$32, %mm6
    226      1.1  mrg 	movd	%mm6, 8(rp)	C the final carry is stored as the top limb of this row
    227      1.1  mrg 
    228      1.1  mrg 	inc	n	C n is negative, so this shortens the count by one limb
    229      1.1  mrg C	jz	L(done)
    230      1.1  mrg   lea	-16(up), up	C rebase up; rp needs no change and control falls into L(ol1)
    231      1.1  mrg C  lea	(rp), rp
    232      1.1  mrg C	jmp	L(ol1)
    233      1.1  mrg 
    234      1.1  mrg C ================================================================
    235      1.1  mrg 
    236      1.1  mrg L(ol1):	lea	4(up,n,4), up	C n is negative, so this steps up back for the new row
    237      1.1  mrg 	movd	(up), %mm7	C read next U invariant limb
    238      1.1  mrg 	lea	8(rp,n,4), rp	C likewise step rp back for the new row
    239      1.1  mrg 	mov	n, un
    240      1.1  mrg 
    241      1.1  mrg 	movd	4(up), %mm1
    242      1.1  mrg 	pmuludq	%mm7, %mm1	C first product of this row
    243      1.1  mrg 	sar	$2, un	C un = n >> 2 (arithmetic), still negative
    244      1.1  mrg 	movd	%mm1, %ebx	C ebx = low limb of the first product
    245      1.1  mrg 	inc	un
    246      1.1  mrg 	jz	L(re1)	C un reached zero: no loop passes, finish at L(re1)
    247      1.1  mrg 
    248      1.1  mrg 	movd	8(up), %mm0
    249      1.1  mrg 	pmuludq	%mm7, %mm0
    250      1.1  mrg 	xor	%edx, %edx	C zero edx and CF
    251      1.1  mrg 	jmp	L(a1)	C enter the four-limb loop mid-body
    252      1.1  mrg 
    253      1.1  mrg L(la1):	adc	$0, %edx	C fold the pending carry into the high product limb
    254      1.1  mrg 	add	%ebx, 12(rp)	C accumulate into the result limb
    255      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the next product
    256      1.1  mrg 	pmuludq	%mm7, %mm1
    257      1.1  mrg 	lea	16(rp), rp	C lea keeps the carry flag alive for the adc below
    258      1.1  mrg 	psrlq	$32, %mm0
    259      1.1  mrg 	adc	%edx, %eax	C low limb + previous high limb + carry
    260      1.1  mrg 	movd	%mm0, %edx	C edx = high limb of that product
    261      1.1  mrg 	movd	%mm1, %ebx
    262      1.1  mrg 	movd	8(up), %mm0
    263      1.1  mrg 	pmuludq	%mm7, %mm0
    264      1.1  mrg 	adc	$0, %edx
    265      1.1  mrg 	add	%eax, (rp)
    266      1.1  mrg L(a1):	psrlq	$32, %mm1	C loop entry point used on the first pass
    267      1.1  mrg 	adc	%edx, %ebx
    268      1.1  mrg 	movd	%mm1, %edx
    269      1.1  mrg 	movd	%mm0, %eax
    270      1.1  mrg 	movd	12(up), %mm1
    271      1.1  mrg 	pmuludq	%mm7, %mm1
    272      1.1  mrg 	adc	$0, %edx
    273      1.1  mrg 	add	%ebx, 4(rp)
    274      1.1  mrg 	psrlq	$32, %mm0
    275      1.1  mrg 	adc	%edx, %eax
    276      1.1  mrg 	movd	%mm0, %edx
    277      1.1  mrg 	movd	%mm1, %ebx
    278      1.1  mrg 	lea	16(up), up
    279      1.1  mrg 	movd	(up), %mm0
    280      1.1  mrg 	adc	$0, %edx
    281      1.1  mrg 	add	%eax, 8(rp)
    282      1.1  mrg 	psrlq	$32, %mm1
    283      1.1  mrg 	adc	%edx, %ebx
    284      1.1  mrg 	movd	%mm1, %edx
    285      1.1  mrg 	pmuludq	%mm7, %mm0
    286      1.1  mrg 	inc	un	C one more group of four limbs done; inc keeps CF
    287      1.1  mrg 	movd	4(up), %mm1
    288      1.1  mrg 	jnz	L(la1)	C ZF still from inc un (movd keeps flags)
    289      1.1  mrg 
    290      1.1  mrg 	adc	un, %edx	C un is zero here
    291      1.1  mrg 	add	%ebx, 12(rp)	C wind-down: add the remaining products of this row
    292      1.1  mrg 	movd	%mm0, %eax
    293      1.1  mrg 	pmuludq	%mm7, %mm1
    294      1.1  mrg 	lea	16(rp), rp
    295      1.1  mrg 	psrlq	$32, %mm0
    296      1.1  mrg 	adc	%edx, %eax
    297      1.1  mrg 	movd	%mm0, %edx
    298      1.1  mrg 	movd	%mm1, %ebx
    299      1.1  mrg 	adc	un, %edx	C un serves as a zero constant throughout the wind-down
    300      1.1  mrg 	add	%eax, (rp)
    301      1.1  mrg 	psrlq	$32, %mm1
    302      1.1  mrg 	adc	%edx, %ebx
    303      1.1  mrg 	movd	%mm1, %eax
    304      1.1  mrg 	adc	un, %eax
    305      1.1  mrg 	add	%ebx, 4(rp)
    306      1.1  mrg 	adc	un, %eax
    307      1.1  mrg 	mov	%eax, 8(rp)	C plain store: this top limb is written, not accumulated
    308      1.1  mrg 
    309      1.1  mrg 	inc	n	C shorten the negative count; control falls into L(ol0)
    310      1.1  mrg 
    311      1.1  mrg C ================================================================
    312      1.1  mrg 
    313      1.1  mrg L(ol0):	lea	(up,n,4), up	C n is negative, so this steps up back for the new row
    314      1.1  mrg 	movd	4(up), %mm7	C read next U invariant limb
    315      1.1  mrg 	lea	4(rp,n,4), rp	C likewise step rp back for the new row
    316      1.1  mrg 	mov	n, un
    317      1.1  mrg 
    318      1.1  mrg 	movd	8(up), %mm0
    319      1.1  mrg 	pmuludq	%mm7, %mm0	C first product of this row
    320      1.1  mrg 	sar	$2, un	C un = n >> 2 (arithmetic), still negative
    321      1.1  mrg 	movd	12(up), %mm1
    322      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the first product
    323      1.1  mrg 	pmuludq	%mm7, %mm1
    324      1.1  mrg 	xor	%edx, %edx	C zero edx and CF
    325      1.1  mrg 	jmp	L(a0)	C enter the four-limb loop mid-body
    326      1.1  mrg 
    327      1.1  mrg L(la0):	adc	$0, %edx	C fold the pending carry into the high product limb
    328      1.1  mrg 	add	%ebx, 12(rp)	C accumulate into the result limb
    329      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the next product
    330      1.1  mrg 	pmuludq	%mm7, %mm1
    331      1.1  mrg 	lea	16(rp), rp	C lea keeps the carry flag alive for the adc below
    332      1.1  mrg 	psrlq	$32, %mm0
    333      1.1  mrg 	adc	%edx, %eax	C low limb + previous high limb + carry
    334      1.1  mrg 	movd	%mm0, %edx	C edx = high limb of that product
    335      1.1  mrg 	movd	%mm1, %ebx
    336      1.1  mrg 	movd	8(up), %mm0
    337      1.1  mrg 	pmuludq	%mm7, %mm0
    338      1.1  mrg 	adc	$0, %edx
    339      1.1  mrg 	add	%eax, (rp)
    340      1.1  mrg 	psrlq	$32, %mm1
    341      1.1  mrg 	adc	%edx, %ebx
    342      1.1  mrg 	movd	%mm1, %edx
    343      1.1  mrg 	movd	%mm0, %eax
    344      1.1  mrg 	movd	12(up), %mm1
    345      1.1  mrg 	pmuludq	%mm7, %mm1
    346      1.1  mrg 	adc	$0, %edx
    347      1.1  mrg 	add	%ebx, 4(rp)
    348      1.1  mrg L(a0):	psrlq	$32, %mm0	C loop entry point used on the first pass
    349      1.1  mrg 	adc	%edx, %eax
    350      1.1  mrg 	movd	%mm0, %edx
    351      1.1  mrg 	movd	%mm1, %ebx
    352      1.1  mrg 	lea	16(up), up
    353      1.1  mrg 	movd	(up), %mm0
    354      1.1  mrg 	adc	$0, %edx
    355      1.1  mrg 	add	%eax, 8(rp)
    356      1.1  mrg 	psrlq	$32, %mm1
    357      1.1  mrg 	adc	%edx, %ebx
    358      1.1  mrg 	movd	%mm1, %edx
    359      1.1  mrg 	pmuludq	%mm7, %mm0
    360      1.1  mrg 	inc	un	C one more group of four limbs done; inc keeps CF
    361      1.1  mrg 	movd	4(up), %mm1
    362      1.1  mrg 	jnz	L(la0)	C ZF still from inc un (movd keeps flags)
    363      1.1  mrg 
    364      1.1  mrg 	adc	un, %edx	C un is zero here
    365      1.1  mrg 	add	%ebx, 12(rp)	C wind-down: add the remaining products of this row
    366      1.1  mrg 	movd	%mm0, %eax
    367      1.1  mrg 	pmuludq	%mm7, %mm1
    368      1.1  mrg 	lea	16(rp), rp
    369      1.1  mrg 	psrlq	$32, %mm0
    370      1.1  mrg 	adc	%edx, %eax
    371      1.1  mrg 	movd	%mm0, %edx
    372      1.1  mrg 	movd	%mm1, %ebx
    373      1.1  mrg 	adc	un, %edx	C un serves as a zero constant throughout the wind-down
    374      1.1  mrg 	add	%eax, (rp)
    375      1.1  mrg 	psrlq	$32, %mm1
    376      1.1  mrg 	adc	%edx, %ebx
    377      1.1  mrg 	movd	%mm1, %eax
    378      1.1  mrg 	adc	un, %eax
    379      1.1  mrg 	add	%ebx, 4(rp)
    380      1.1  mrg 	adc	un, %eax
    381      1.1  mrg 	mov	%eax, 8(rp)	C plain store: this top limb is written, not accumulated
    382      1.1  mrg 
    383      1.1  mrg 	inc	n	C shorten the negative count; control falls into L(ol3)
    384      1.1  mrg 
    385      1.1  mrg C ================================================================
    386      1.1  mrg 
    387      1.1  mrg L(ol3):	lea	12(up,n,4), up	C n is negative, so this steps up back for the new row
    388      1.1  mrg 	movd	-8(up), %mm7	C read next U invariant limb
    389      1.1  mrg 	lea	(rp,n,4), rp	C put rp back
    390      1.1  mrg 	mov	n, un
    391      1.1  mrg 
    392      1.1  mrg 	movd	-4(up), %mm1
    393      1.1  mrg 	pmuludq	%mm7, %mm1	C first product of this row
    394      1.1  mrg 	sar	$2, un	C un = n >> 2 (arithmetic), still negative
    395      1.1  mrg 	movd	%mm1, %ebx	C ebx = low limb of the first product
    396      1.1  mrg 	movd	(up), %mm0	C loaded here, multiplied after the jump at L(a3)
    397      1.1  mrg 	xor	%edx, %edx	C zero edx and CF
    398      1.1  mrg 	jmp	L(a3)	C enter the four-limb loop mid-body
    399      1.1  mrg 
    400      1.1  mrg L(la3):	adc	$0, %edx	C fold the pending carry into the high product limb
    401      1.1  mrg 	add	%ebx, 12(rp)	C accumulate into the result limb
    402      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the next product
    403      1.1  mrg 	pmuludq	%mm7, %mm1
    404      1.1  mrg 	lea	16(rp), rp	C lea keeps the carry flag alive for the adc below
    405      1.1  mrg 	psrlq	$32, %mm0
    406      1.1  mrg 	adc	%edx, %eax	C low limb + previous high limb + carry
    407      1.1  mrg 	movd	%mm0, %edx	C edx = high limb of that product
    408      1.1  mrg 	movd	%mm1, %ebx
    409      1.1  mrg 	movd	8(up), %mm0
    410      1.1  mrg 	pmuludq	%mm7, %mm0
    411      1.1  mrg 	adc	$0, %edx
    412      1.1  mrg 	add	%eax, (rp)
    413      1.1  mrg 	psrlq	$32, %mm1
    414      1.1  mrg 	adc	%edx, %ebx
    415      1.1  mrg 	movd	%mm1, %edx
    416      1.1  mrg 	movd	%mm0, %eax
    417      1.1  mrg 	movd	12(up), %mm1
    418      1.1  mrg 	pmuludq	%mm7, %mm1
    419      1.1  mrg 	adc	$0, %edx
    420      1.1  mrg 	add	%ebx, 4(rp)
    421      1.1  mrg 	psrlq	$32, %mm0
    422      1.1  mrg 	adc	%edx, %eax
    423      1.1  mrg 	movd	%mm0, %edx
    424      1.1  mrg 	movd	%mm1, %ebx
    425      1.1  mrg 	lea	16(up), up
    426      1.1  mrg 	movd	(up), %mm0
    427      1.1  mrg 	adc	$0, %edx
    428      1.1  mrg 	add	%eax, 8(rp)
    429      1.1  mrg L(a3):	psrlq	$32, %mm1	C loop entry point used on the first pass
    430      1.1  mrg 	adc	%edx, %ebx
    431      1.1  mrg 	movd	%mm1, %edx
    432      1.1  mrg 	pmuludq	%mm7, %mm0
    433      1.1  mrg 	inc	un	C one more group of four limbs done; inc keeps CF
    434      1.1  mrg 	movd	4(up), %mm1
    435      1.1  mrg 	jnz	L(la3)	C ZF still from inc un (movd keeps flags)
    436      1.1  mrg 
    437      1.1  mrg 	adc	un, %edx	C un is zero here
    438      1.1  mrg 	add	%ebx, 12(rp)	C wind-down: add the remaining products of this row
    439      1.1  mrg 	movd	%mm0, %eax
    440      1.1  mrg 	pmuludq	%mm7, %mm1
    441      1.1  mrg 	lea	16(rp), rp
    442      1.1  mrg 	psrlq	$32, %mm0
    443      1.1  mrg 	adc	%edx, %eax
    444      1.1  mrg 	movd	%mm0, %edx
    445      1.1  mrg 	movd	%mm1, %ebx
    446      1.1  mrg 	adc	un, %edx	C un serves as a zero constant throughout the wind-down
    447      1.1  mrg 	add	%eax, (rp)
    448      1.1  mrg 	psrlq	$32, %mm1
    449      1.1  mrg 	adc	%edx, %ebx
    450      1.1  mrg 	movd	%mm1, %eax
    451      1.1  mrg 	adc	un, %eax
    452      1.1  mrg 	add	%ebx, 4(rp)
    453      1.1  mrg 	adc	un, %eax
    454      1.1  mrg 	mov	%eax, 8(rp)	C plain store: this top limb is written, not accumulated
    455      1.1  mrg 
    456      1.1  mrg 	inc	n	C shorten the negative count; control falls into L(ol2)
    457      1.1  mrg 
    458      1.1  mrg C ================================================================
    459      1.1  mrg 
    460      1.1  mrg L(ol2):	lea	8(up,n,4), up	C n is negative, so this steps up back for the new row
    461      1.1  mrg 	movd	-4(up), %mm7	C read next U invariant limb
    462      1.1  mrg 	lea	12(rp,n,4), rp	C likewise step rp back for the new row
    463      1.1  mrg 	mov	n, un
    464      1.1  mrg 
    465      1.1  mrg 	movd	(up), %mm0
    466      1.1  mrg 	pmuludq	%mm7, %mm0	C first product of this row
    467      1.1  mrg 	xor	%edx, %edx	C edx = 0, no high limb pending yet
    468      1.1  mrg 	sar	$2, un	C un = n >> 2 (arithmetic), still negative
    469      1.1  mrg 	movd	4(up), %mm1
    470      1.1  mrg 	test	un, un		C clear carry
    471      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the first product
    472      1.1  mrg 	pmuludq	%mm7, %mm1
    473      1.1  mrg 	inc	un	C inc keeps CF clear for the adc at the loop entry
    474      1.1  mrg 	jnz	L(a2)	C un nonzero: enter the four-limb loop mid-body
    475      1.1  mrg 	jmp	L(re2)	C un is zero: no loop passes, finish at L(re2)
    476      1.1  mrg 
    477      1.1  mrg L(la2):	adc	$0, %edx	C fold the pending carry into the high product limb
    478      1.1  mrg 	add	%ebx, 12(rp)	C accumulate into the result limb
    479      1.1  mrg 	movd	%mm0, %eax	C eax = low limb of the next product
    480      1.1  mrg 	pmuludq	%mm7, %mm1
    481      1.1  mrg 	lea	16(rp), rp	C lea keeps the carry flag alive for the adc below
    482      1.1  mrg L(a2):	psrlq	$32, %mm0	C loop entry point used on the first pass
    483      1.1  mrg 	adc	%edx, %eax	C low limb + previous high limb + carry
    484      1.1  mrg 	movd	%mm0, %edx	C edx = high limb of that product
    485      1.1  mrg 	movd	%mm1, %ebx
    486      1.1  mrg 	movd	8(up), %mm0
    487      1.1  mrg 	pmuludq	%mm7, %mm0
    488      1.1  mrg 	adc	$0, %edx
    489      1.1  mrg 	add	%eax, (rp)
    490      1.1  mrg 	psrlq	$32, %mm1
    491      1.1  mrg 	adc	%edx, %ebx
    492      1.1  mrg 	movd	%mm1, %edx
    493      1.1  mrg 	movd	%mm0, %eax
    494      1.1  mrg 	movd	12(up), %mm1
    495      1.1  mrg 	pmuludq	%mm7, %mm1
    496      1.1  mrg 	adc	$0, %edx
    497      1.1  mrg 	add	%ebx, 4(rp)
    498      1.1  mrg 	psrlq	$32, %mm0
    499      1.1  mrg 	adc	%edx, %eax
    500      1.1  mrg 	movd	%mm0, %edx
    501      1.1  mrg 	movd	%mm1, %ebx
    502      1.1  mrg 	lea	16(up), up
    503      1.1  mrg 	movd	(up), %mm0
    504      1.1  mrg 	adc	$0, %edx
    505      1.1  mrg 	add	%eax, 8(rp)
    506      1.1  mrg 	psrlq	$32, %mm1
    507      1.1  mrg 	adc	%edx, %ebx
    508      1.1  mrg 	movd	%mm1, %edx
    509      1.1  mrg 	pmuludq	%mm7, %mm0
    510      1.1  mrg 	inc	un	C one more group of four limbs done; inc keeps CF
    511      1.1  mrg 	movd	4(up), %mm1
    512      1.1  mrg 	jnz	L(la2)	C ZF still from inc un (movd keeps flags)
    513      1.1  mrg 
    514      1.1  mrg 	adc	un, %edx	C un is zero here
    515      1.1  mrg 	add	%ebx, 12(rp)	C wind-down: add the remaining products of this row
    516      1.1  mrg 	movd	%mm0, %eax
    517      1.1  mrg 	pmuludq	%mm7, %mm1
    518      1.1  mrg 	lea	16(rp), rp
    519      1.1  mrg 	psrlq	$32, %mm0
    520      1.1  mrg 	adc	%edx, %eax
    521      1.1  mrg 	movd	%mm0, %edx
    522      1.1  mrg 	movd	%mm1, %ebx
    523      1.1  mrg 	adc	un, %edx	C un serves as a zero constant throughout the wind-down
    524      1.1  mrg 	add	%eax, (rp)
    525      1.1  mrg 	psrlq	$32, %mm1
    526      1.1  mrg 	adc	%edx, %ebx
    527      1.1  mrg 	movd	%mm1, %eax
    528      1.1  mrg 	adc	un, %eax
    529      1.1  mrg 	add	%ebx, 4(rp)
    530      1.1  mrg 	adc	un, %eax
    531      1.1  mrg 	mov	%eax, 8(rp)	C plain store: this top limb is written, not accumulated
    532      1.1  mrg 
    533      1.1  mrg 	inc	n	C shorten the negative count for the next row
    534      1.1  mrg 	jmp	L(ol1)	C the four row variants run in the cycle ol1, ol0, ol3, ol2
    535      1.1  mrg 
    536      1.1  mrg C ================================================================
    537      1.1  mrg L(re2):	psrlq	$32, %mm0	C tail reached from L(ol2) with un zero and two products pending
    538      1.1  mrg 	movd	(up), %mm7	C read next U invariant limb
    539      1.1  mrg 	adc	%edx, %eax	C edx is zero and CF clear on arrival from L(ol2)
    540      1.1  mrg 	movd	%mm0, %edx	C edx = high limb of the first product
    541      1.1  mrg 	movd	%mm1, %ebx	C ebx = low limb of the second product
    542      1.1  mrg 	adc	un, %edx	C un is zero on this path, so this only adds the carry
    543      1.1  mrg 	add	%eax, (rp)	C accumulate the first product limb
    544      1.1  mrg 	lea	4(rp), rp
    545      1.1  mrg 	psrlq	$32, %mm1
    546      1.1  mrg 	adc	%edx, %ebx
    547      1.1  mrg 	movd	%mm1, %eax	C eax = high limb of the second product
    548      1.1  mrg 	movd	4(up), %mm1
    549      1.1  mrg 	adc	un, %eax
    550      1.1  mrg 	add	%ebx, (rp)
    551      1.1  mrg 	pmuludq	%mm7, %mm1	C product for the last row, using the multiplier just loaded
    552      1.1  mrg 	adc	un, %eax
    553      1.1  mrg 	mov	%eax, 4(rp)	C plain store: this top limb is written, not accumulated
    554      1.1  mrg 	movd	%mm1, %ebx	C ebx = low limb of the last product
    555      1.1  mrg 
    556      1.1  mrg L(re1):	psrlq	$32, %mm1	C also entered from L(ol1) with un zero and one product in mm1 and ebx
    557      1.1  mrg 	add	%ebx, 4(rp)	C accumulate the low limb of the last product
    558      1.1  mrg 	movd	%mm1, %eax
    559      1.1  mrg 	adc	un, %eax	C high limb plus carry; un is zero
    560      1.1  mrg 	xor	n, n		C make n zeroness assumption below true
    561      1.1  mrg 	mov	%eax, 8(rp)	C store the topmost limb of the triangular product
    562      1.1  mrg 
    563      1.1  mrg L(done):			C n is zero here
    564      1.1  mrg 	mov	24(%esp), up	C reload the original source pointer (four registers are pushed)
    565      1.1  mrg 	mov	28(%esp), %eax	C reload the original limb count xn
    566      1.1  mrg 
    567      1.1  mrg 	movd	(up), %mm0
    568      1.1  mrg 	inc	%eax
    569      1.1  mrg 	pmuludq	%mm0, %mm0	C mm0 = square of the lowest source limb
    570      1.1  mrg 	lea	4(up), up
    571      1.1  mrg 	mov	20(%esp), rp	C reload the original destination pointer
    572      1.1  mrg 	shr	%eax	C eax = (xn+1)/2 loop passes; CF = low bit of xn+1
    573      1.1  mrg 	movd	%mm0, (rp)	C low half of that square is result limb 0
    574      1.1  mrg 	psrlq	$32, %mm0	C keep the high half for the next limb
    575      1.1  mrg 	lea	-12(rp), rp
    576      1.1  mrg 	mov	%eax, 28(%esp)	C the loop counter lives in the argument stack slot
    577      1.1  mrg 	jnc	L(odd)	C CF still from shr (mov, movd, lea, psrlq keep flags): xn is odd
    578      1.1  mrg 
    579      1.1  mrg 	movd	%mm0, %ebp	C even xn: handle one extra square before joining the loop
    580      1.1  mrg 	movd	(up), %mm0
    581      1.1  mrg 	lea	8(rp), rp
    582      1.1  mrg 	pmuludq	%mm0, %mm0	C square of the second source limb
    583      1.1  mrg 	lea	-4(up), up
    584      1.1  mrg 	add	8(rp), %ebp	C square limb + triangular limb, first carry chain
    585      1.1  mrg 	movd	%mm0, %edx
    586      1.1  mrg 	adc	12(rp), %edx
    587      1.1  mrg 	rcr	n	C n is zero: park this carry in its top bit, CF becomes 0
    588      1.1  mrg 	jmp	L(ent)
    589      1.1  mrg 
    590      1.1  mrg C	ALIGN(16)		C alignment seems irrelevant
    591      1.1  mrg L(top):	movd	(up), %mm1
    592      1.1  mrg 	adc	n, n	C swap carries: CF = parked top bit of n, old CF goes to bit 0
    593      1.1  mrg 	movd	%mm0, %eax	C eax = high half of the previous square
    594      1.1  mrg 	pmuludq	%mm1, %mm1	C square the next two source limbs
    595      1.1  mrg 	movd	4(up), %mm0
    596      1.1  mrg 	adc	(rp), %eax	C first chain: square limb + triangular limb
    597      1.1  mrg 	movd	%mm1, %ebx
    598      1.1  mrg 	pmuludq	%mm0, %mm0
    599      1.1  mrg 	psrlq	$32, %mm1
    600      1.1  mrg 	adc	4(rp), %ebx
    601      1.1  mrg 	movd	%mm1, %ebp
    602      1.1  mrg 	movd	%mm0, %edx
    603      1.1  mrg 	adc	8(rp), %ebp
    604      1.1  mrg 	adc	12(rp), %edx
    605      1.1  mrg 	rcr	n		C FIXME: isn't this awfully slow on atom???
    606      1.1  mrg 	adc	%eax, (rp)	C second chain: add back, so the triangular limb counts twice
    607      1.1  mrg 	adc	%ebx, 4(rp)
    608      1.1  mrg L(ent):	lea	8(up), up	C lea keeps CF for the second chain
    609      1.1  mrg 	adc	%ebp, 8(rp)
    610      1.1  mrg 	psrlq	$32, %mm0	C keep the high half of the last square for the next pass
    611      1.1  mrg 	adc	%edx, 12(rp)
    612      1.1  mrg L(odd):	decl	28(%esp)	C count down the passes; dec leaves CF untouched
    613      1.1  mrg 	lea	16(rp), rp
    614      1.1  mrg 	jnz	L(top)	C ZF still from decl (lea keeps flags)
    615      1.1  mrg 
    616      1.1  mrg L(end):	adc	n, n	C bring back the parked carry, leaving the other one in bit 0
    617      1.1  mrg 	movd	%mm0, %eax	C high half of the last square
    618      1.1  mrg 	adc	n, %eax	C add both outstanding carries
    619      1.1  mrg 	mov	%eax, (rp)	C store the most significant result limb
    620      1.1  mrg 
    621      1.1  mrg L(rtn):	emms	C clear the MMX state before returning
    622      1.1  mrg 	pop	%ebp	C restore registers in reverse order of the pushes
    623      1.1  mrg 	pop	%ebx
    624      1.1  mrg 	pop	%esi
    625      1.1  mrg 	pop	%edi
    626      1.1  mrg 	ret
    627      1.1  mrg 
    628      1.1  mrg L(one):	pmuludq	%mm7, %mm7	C single-limb input: mm7 = 64-bit square of up[0]
    629      1.1  mrg 	movq	%mm7, -4(rp)	C rp was advanced by 4 at entry, so this writes result limbs 0 and 1
    630      1.1  mrg 	emms	C clear the MMX state before returning
    631      1.1  mrg 	pop	%esi	C only edi and esi were pushed on this path
    632      1.1  mrg 	pop	%edi
    633      1.1  mrg 	ret
    634      1.1  mrg EPILOGUE()
    635