Home | History | Annotate | Line # | Download | only in pentium
      1      1.1  mrg dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
     35      1.1  mrg C product at around 20x20 limbs.
     36      1.1  mrg 
     37      1.1  mrg 
     38      1.1  mrg C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
     39      1.1  mrg C
     40      1.1  mrg C Calculate src,size squared, storing the result in dst,2*size.
     41      1.1  mrg C
     42      1.1  mrg C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
     43      1.1  mrg C lot of function call overheads are avoided, especially when the size is
     44      1.1  mrg C small.
     45      1.1  mrg 
     46      1.1  mrg defframe(PARAM_SIZE,12)
     47      1.1  mrg defframe(PARAM_SRC, 8)
     48      1.1  mrg defframe(PARAM_DST, 4)
     49      1.1  mrg 
     50      1.1  mrg 	TEXT
     51      1.1  mrg 	ALIGN(8)
     52      1.1  mrg PROLOGUE(mpn_sqr_basecase)
     53      1.1  mrg deflit(`FRAME',0)
     54      1.1  mrg 
     55      1.1  mrg 	movl	PARAM_SIZE, %edx
     56      1.1  mrg 	movl	PARAM_SRC, %eax
     57      1.1  mrg 
     58      1.1  mrg 	cmpl	$2, %edx
     59      1.1  mrg 	movl	PARAM_DST, %ecx
     60      1.1  mrg 
     61      1.1  mrg 	je	L(two_limbs)
     62      1.1  mrg 
     63      1.1  mrg 	movl	(%eax), %eax
     64      1.1  mrg 	ja	L(three_or_more)
     65      1.1  mrg 
     66      1.1  mrg C -----------------------------------------------------------------------------
     67      1.1  mrg C one limb only
     68      1.1  mrg 	C eax	src
     69      1.1  mrg 	C ebx
     70      1.1  mrg 	C ecx	dst
     71      1.1  mrg 	C edx
     72      1.1  mrg 
     73      1.1  mrg 	mull	%eax
     74      1.1  mrg 
     75      1.1  mrg 	movl	%eax, (%ecx)
     76      1.1  mrg 	movl	%edx, 4(%ecx)
     77      1.1  mrg 
     78      1.1  mrg 	ret
     79      1.1  mrg 
     80      1.1  mrg C -----------------------------------------------------------------------------
     81      1.1  mrg 	ALIGN(8)
     82      1.1  mrg L(two_limbs):
     83      1.1  mrg 	C eax	src
     84      1.1  mrg 	C ebx
     85      1.1  mrg 	C ecx	dst
     86      1.1  mrg 	C edx	size
     87      1.1  mrg 
     88      1.1  mrg 	pushl	%ebp
     89      1.1  mrg 	pushl	%edi
     90      1.1  mrg 
     91      1.1  mrg 	pushl	%esi
     92      1.1  mrg 	pushl	%ebx
     93      1.1  mrg 
     94      1.1  mrg 	movl	%eax, %ebx
     95      1.1  mrg 	movl	(%eax), %eax
     96      1.1  mrg 
     97      1.1  mrg 	mull	%eax		C src[0]^2
     98      1.1  mrg 
     99      1.1  mrg 	movl	%eax, (%ecx)	C dst[0]
    100      1.1  mrg 	movl	%edx, %esi	C dst[1]
    101      1.1  mrg 
    102      1.1  mrg 	movl	4(%ebx), %eax
    103      1.1  mrg 
    104      1.1  mrg 	mull	%eax		C src[1]^2
    105      1.1  mrg 
    106      1.1  mrg 	movl	%eax, %edi	C dst[2]
    107      1.1  mrg 	movl	%edx, %ebp	C dst[3]
    108      1.1  mrg 
    109      1.1  mrg 	movl	(%ebx), %eax
    110      1.1  mrg 
    111      1.1  mrg 	mull	4(%ebx)		C src[0]*src[1]
    112      1.1  mrg 
    113      1.1  mrg 	addl	%eax, %esi
    114      1.1  mrg 	popl	%ebx
    115      1.1  mrg 
    116      1.1  mrg 	adcl	%edx, %edi
    117      1.1  mrg 
    118      1.1  mrg 	adcl	$0, %ebp
    119      1.1  mrg 	addl	%esi, %eax
    120      1.1  mrg 
    121      1.1  mrg 	adcl	%edi, %edx
    122      1.1  mrg 	movl	%eax, 4(%ecx)
    123      1.1  mrg 
    124      1.1  mrg 	adcl	$0, %ebp
    125      1.1  mrg 	popl	%esi
    126      1.1  mrg 
    127      1.1  mrg 	movl	%edx, 8(%ecx)
    128      1.1  mrg 	movl	%ebp, 12(%ecx)
    129      1.1  mrg 
    130      1.1  mrg 	popl	%edi
    131      1.1  mrg 	popl	%ebp
    132      1.1  mrg 
    133      1.1  mrg 	ret
    134      1.1  mrg 
    135      1.1  mrg 
    136      1.1  mrg C -----------------------------------------------------------------------------
    137      1.1  mrg 	ALIGN(8)
    138      1.1  mrg L(three_or_more):
    139      1.1  mrg 	C eax	src low limb
    140      1.1  mrg 	C ebx
    141      1.1  mrg 	C ecx	dst
    142      1.1  mrg 	C edx	size
    143      1.1  mrg 
    144      1.1  mrg 	cmpl	$4, %edx
    145      1.1  mrg 	pushl	%ebx
    146      1.1  mrg deflit(`FRAME',4)
    147      1.1  mrg 
    148      1.1  mrg 	movl	PARAM_SRC, %ebx
    149      1.1  mrg 	jae	L(four_or_more)
    150      1.1  mrg 
    151      1.1  mrg 
    152      1.1  mrg C -----------------------------------------------------------------------------
    153      1.1  mrg C three limbs
    154      1.1  mrg 	C eax	src low limb
    155      1.1  mrg 	C ebx	src
    156      1.1  mrg 	C ecx	dst
    157      1.1  mrg 	C edx	size
    158      1.1  mrg 
    159      1.1  mrg 	pushl	%ebp
    160      1.1  mrg 	pushl	%edi
    161      1.1  mrg 
    162      1.1  mrg 	mull	%eax		C src[0] ^ 2
    163      1.1  mrg 
    164      1.1  mrg 	movl	%eax, (%ecx)
    165      1.1  mrg 	movl	%edx, 4(%ecx)
    166      1.1  mrg 
    167      1.1  mrg 	movl	4(%ebx), %eax
    168      1.1  mrg 	xorl	%ebp, %ebp
    169      1.1  mrg 
    170      1.1  mrg 	mull	%eax		C src[1] ^ 2
    171      1.1  mrg 
    172      1.1  mrg 	movl	%eax, 8(%ecx)
    173      1.1  mrg 	movl	%edx, 12(%ecx)
    174      1.1  mrg 
    175      1.1  mrg 	movl	8(%ebx), %eax
    176      1.1  mrg 	pushl	%esi		C risk of cache bank clash
    177      1.1  mrg 
    178      1.1  mrg 	mull	%eax		C src[2] ^ 2
    179      1.1  mrg 
    180      1.1  mrg 	movl	%eax, 16(%ecx)
    181      1.1  mrg 	movl	%edx, 20(%ecx)
    182      1.1  mrg 
    183      1.1  mrg 	movl	(%ebx), %eax
    184      1.1  mrg 
    185      1.1  mrg 	mull	4(%ebx)		C src[0] * src[1]
    186      1.1  mrg 
    187      1.1  mrg 	movl	%eax, %esi
    188      1.1  mrg 	movl	%edx, %edi
    189      1.1  mrg 
    190      1.1  mrg 	movl	(%ebx), %eax
    191      1.1  mrg 
    192      1.1  mrg 	mull	8(%ebx)		C src[0] * src[2]
    193      1.1  mrg 
    194      1.1  mrg 	addl	%eax, %edi
    195      1.1  mrg 	movl	%edx, %ebp
    196      1.1  mrg 
    197      1.1  mrg 	adcl	$0, %ebp
    198      1.1  mrg 	movl	4(%ebx), %eax
    199      1.1  mrg 
    200      1.1  mrg 	mull	8(%ebx)		C src[1] * src[2]
    201      1.1  mrg 
    202      1.1  mrg 	xorl	%ebx, %ebx
    203      1.1  mrg 	addl	%eax, %ebp
    204      1.1  mrg 
    205      1.1  mrg 	C eax
    206      1.1  mrg 	C ebx	zero, will be dst[5]
    207      1.1  mrg 	C ecx	dst
    208      1.1  mrg 	C edx	dst[4]
    209      1.1  mrg 	C esi	dst[1]
    210      1.1  mrg 	C edi	dst[2]
    211      1.1  mrg 	C ebp	dst[3]
    212      1.1  mrg 
    213      1.1  mrg 	adcl	$0, %edx
    214      1.1  mrg 	addl	%esi, %esi
    215      1.1  mrg 
    216      1.1  mrg 	adcl	%edi, %edi
    217      1.1  mrg 
    218      1.1  mrg 	adcl	%ebp, %ebp
    219      1.1  mrg 
    220      1.1  mrg 	adcl	%edx, %edx
    221      1.1  mrg 	movl	4(%ecx), %eax
    222      1.1  mrg 
    223      1.1  mrg 	adcl	$0, %ebx
    224      1.1  mrg 	addl	%esi, %eax
    225      1.1  mrg 
    226      1.1  mrg 	movl	%eax, 4(%ecx)
    227      1.1  mrg 	movl	8(%ecx), %eax
    228      1.1  mrg 
    229      1.1  mrg 	adcl	%edi, %eax
    230      1.1  mrg 	movl	12(%ecx), %esi
    231      1.1  mrg 
    232      1.1  mrg 	adcl	%ebp, %esi
    233      1.1  mrg 	movl	16(%ecx), %edi
    234      1.1  mrg 
    235      1.1  mrg 	movl	%eax, 8(%ecx)
    236      1.1  mrg 	movl	%esi, 12(%ecx)
    237      1.1  mrg 
    238      1.1  mrg 	adcl	%edx, %edi
    239      1.1  mrg 	popl	%esi
    240      1.1  mrg 
    241      1.1  mrg 	movl	20(%ecx), %eax
    242      1.1  mrg 	movl	%edi, 16(%ecx)
    243      1.1  mrg 
    244      1.1  mrg 	popl	%edi
    245      1.1  mrg 	popl	%ebp
    246      1.1  mrg 
    247      1.1  mrg 	adcl	%ebx, %eax	C no carry out of this
    248      1.1  mrg 	popl	%ebx
    249      1.1  mrg 
    250      1.1  mrg 	movl	%eax, 20(%ecx)
    251      1.1  mrg 
    252      1.1  mrg 	ret
    253      1.1  mrg 
    254      1.1  mrg 
    255      1.1  mrg C -----------------------------------------------------------------------------
    256      1.1  mrg 	ALIGN(8)
    257      1.1  mrg L(four_or_more):
    258      1.1  mrg 	C eax	src low limb
    259      1.1  mrg 	C ebx	src
    260      1.1  mrg 	C ecx	dst
    261      1.1  mrg 	C edx	size
    262      1.1  mrg 	C esi
    263      1.1  mrg 	C edi
    264      1.1  mrg 	C ebp
    265      1.1  mrg 	C
    266      1.1  mrg 	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
    267      1.1  mrg 
    268      1.1  mrg deflit(`FRAME',4)
    269      1.1  mrg 
    270      1.1  mrg 	pushl	%edi
    271      1.1  mrg FRAME_pushl()
    272      1.1  mrg 	pushl	%esi
    273      1.1  mrg FRAME_pushl()
    274      1.1  mrg 
    275      1.1  mrg 	pushl	%ebp
    276      1.1  mrg FRAME_pushl()
    277      1.1  mrg 	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
    278      1.1  mrg 
    279      1.1  mrg 	leal	(%ebx,%edx,4), %esi	C src end
    280      1.1  mrg 	movl	%ebx, %ebp		C src
    281      1.1  mrg 
    282      1.1  mrg 	negl	%edx			C -size
    283      1.1  mrg 	xorl	%ebx, %ebx		C clear carry limb and carry flag
    284      1.1  mrg 
    285      1.1  mrg 	leal	1(%edx), %ecx		C -(size-1)
    286      1.1  mrg 
    287      1.1  mrg L(mul1):
    288      1.1  mrg 	C eax	scratch
    289      1.1  mrg 	C ebx	carry
    290      1.1  mrg 	C ecx	counter, negative
    291      1.1  mrg 	C edx	scratch
    292      1.1  mrg 	C esi	&src[size]
    293      1.1  mrg 	C edi	&dst[size]
    294      1.1  mrg 	C ebp	src
    295      1.1  mrg 
    296      1.1  mrg 	adcl	$0, %ebx
    297      1.1  mrg 	movl	(%esi,%ecx,4), %eax
    298      1.1  mrg 
    299      1.1  mrg 	mull	(%ebp)
    300      1.1  mrg 
    301      1.1  mrg 	addl	%eax, %ebx
    302      1.1  mrg 
    303      1.1  mrg 	movl	%ebx, (%edi,%ecx,4)
    304      1.1  mrg 	incl	%ecx
    305      1.1  mrg 
    306      1.1  mrg 	movl	%edx, %ebx
    307      1.1  mrg 	jnz	L(mul1)
    308      1.1  mrg 
    309      1.1  mrg 
    310      1.1  mrg 	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
    311      1.1  mrg 	C n=1..size-2.
    312      1.1  mrg 	C
    313      1.1  mrg 	C The last two products, which are the end corner of the product
    314      1.1  mrg 	C triangle, are handled separately to save looping overhead.  These
    315      1.1  mrg 	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
    316      1.1  mrg 	C If size is 4 then it's only these that need to be done.
    317      1.1  mrg 	C
    318      1.1  mrg 	C In the outer loop %esi is a constant, and %edi just advances by 1
    319      1.1  mrg 	C limb each time.  The size of the operation decreases by 1 limb
    320      1.1  mrg 	C each time.
    321      1.1  mrg 
    322      1.1  mrg 	C eax
    323      1.1  mrg 	C ebx	carry (needing carry flag added)
    324      1.1  mrg 	C ecx
    325      1.1  mrg 	C edx
    326      1.1  mrg 	C esi	&src[size]
    327      1.1  mrg 	C edi	&dst[size]
    328      1.1  mrg 	C ebp
    329      1.1  mrg 
    330      1.1  mrg 	adcl	$0, %ebx
    331      1.1  mrg 	movl	PARAM_SIZE, %edx
    332      1.1  mrg 
    333      1.1  mrg 	movl	%ebx, (%edi)
    334      1.1  mrg 	subl	$4, %edx
    335      1.1  mrg 
    336      1.1  mrg 	negl	%edx
    337      1.1  mrg 	jz	L(corner)
    338      1.1  mrg 
    339      1.1  mrg 
    340      1.1  mrg L(outer):
    341      1.1  mrg 	C ebx	previous carry limb to store
    342      1.1  mrg 	C edx	outer loop counter (negative)
    343      1.1  mrg 	C esi	&src[size]
    344      1.1  mrg 	C edi	dst, pointing at stored carry limb of previous loop
    345      1.1  mrg 
    346      1.1  mrg 	pushl	%edx			C new outer loop counter
    347      1.1  mrg 	leal	-2(%edx), %ecx
    348      1.1  mrg 
    349      1.1  mrg 	movl	%ebx, (%edi)
    350      1.1  mrg 	addl	$4, %edi
    351      1.1  mrg 
    352      1.1  mrg 	addl	$4, %ebp
    353      1.1  mrg 	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
    354      1.1  mrg 
    355      1.1  mrg L(inner):
    356      1.1  mrg 	C eax	scratch
    357      1.1  mrg 	C ebx	carry (needing carry flag added)
    358      1.1  mrg 	C ecx	counter, negative
    359      1.1  mrg 	C edx	scratch
    360      1.1  mrg 	C esi	&src[size]
    361      1.1  mrg 	C edi	dst end of this addmul
    362      1.1  mrg 	C ebp	&src[j]
    363      1.1  mrg 
    364      1.1  mrg 	adcl	$0, %ebx
    365      1.1  mrg 	movl	(%esi,%ecx,4), %eax
    366      1.1  mrg 
    367      1.1  mrg 	mull	(%ebp)
    368      1.1  mrg 
    369      1.1  mrg 	addl	%ebx, %eax
    370      1.1  mrg 	movl	(%edi,%ecx,4), %ebx
    371      1.1  mrg 
    372      1.1  mrg 	adcl	$0, %edx
    373      1.1  mrg 	addl	%eax, %ebx
    374      1.1  mrg 
    375      1.1  mrg 	movl	%ebx, (%edi,%ecx,4)
    376      1.1  mrg 	incl	%ecx
    377      1.1  mrg 
    378      1.1  mrg 	movl	%edx, %ebx
    379      1.1  mrg 	jnz	L(inner)
    380      1.1  mrg 
    381      1.1  mrg 
    382      1.1  mrg 	adcl	$0, %ebx
    383      1.1  mrg 	popl	%edx		C outer loop counter
    384      1.1  mrg 
    385      1.1  mrg 	incl	%edx
    386      1.1  mrg 	jnz	L(outer)
    387      1.1  mrg 
    388      1.1  mrg 
    389      1.1  mrg 	movl	%ebx, (%edi)
    390      1.1  mrg 
    391      1.1  mrg L(corner):
    392      1.1  mrg 	C esi	&src[size]
    393      1.1  mrg 	C edi	&dst[2*size-4]
    394      1.1  mrg 
    395      1.1  mrg 	movl	-8(%esi), %eax
    396      1.1  mrg 	movl	-4(%edi), %ebx		C risk of data cache bank clash here
    397      1.1  mrg 
    398      1.1  mrg 	mull	-12(%esi)		C src[size-2]*src[size-3]
    399      1.1  mrg 
    400      1.1  mrg 	addl	%eax, %ebx
    401      1.1  mrg 	movl	%edx, %ecx
    402      1.1  mrg 
    403      1.1  mrg 	adcl	$0, %ecx
    404      1.1  mrg 	movl	-4(%esi), %eax
    405      1.1  mrg 
    406      1.1  mrg 	mull	-12(%esi)		C src[size-1]*src[size-3]
    407      1.1  mrg 
    408      1.1  mrg 	addl	%ecx, %eax
    409      1.1  mrg 	movl	(%edi), %ecx
    410      1.1  mrg 
    411      1.1  mrg 	adcl	$0, %edx
    412      1.1  mrg 	movl	%ebx, -4(%edi)
    413      1.1  mrg 
    414      1.1  mrg 	addl	%eax, %ecx
    415      1.1  mrg 	movl	%edx, %ebx
    416      1.1  mrg 
    417      1.1  mrg 	adcl	$0, %ebx
    418      1.1  mrg 	movl	-4(%esi), %eax
    419      1.1  mrg 
    420      1.1  mrg 	mull	-8(%esi)		C src[size-1]*src[size-2]
    421      1.1  mrg 
    422      1.1  mrg 	movl	%ecx, (%edi)
    423      1.1  mrg 	addl	%eax, %ebx
    424      1.1  mrg 
    425      1.1  mrg 	adcl	$0, %edx
    426      1.1  mrg 	movl	PARAM_SIZE, %eax
    427      1.1  mrg 
    428      1.1  mrg 	negl	%eax
    429      1.1  mrg 	movl	%ebx, 4(%edi)
    430      1.1  mrg 
    431      1.1  mrg 	addl	$1, %eax		C -(size-1) and clear carry
    432      1.1  mrg 	movl	%edx, 8(%edi)
    433      1.1  mrg 
    434      1.1  mrg 
    435      1.1  mrg C -----------------------------------------------------------------------------
    436      1.1  mrg C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
    437      1.1  mrg 
    438      1.1  mrg L(lshift):
    439      1.1  mrg 	C eax	counter, negative
    440      1.1  mrg 	C ebx	next limb
    441      1.1  mrg 	C ecx
    442      1.1  mrg 	C edx
    443      1.1  mrg 	C esi
    444      1.1  mrg 	C edi	&dst[2*size-4]
    445      1.1  mrg 	C ebp
    446      1.1  mrg 
    447      1.1  mrg 	movl	12(%edi,%eax,8), %ebx
    448      1.1  mrg 
    449      1.1  mrg 	rcll	%ebx
    450      1.1  mrg 	movl	16(%edi,%eax,8), %ecx
    451      1.1  mrg 
    452      1.1  mrg 	rcll	%ecx
    453      1.1  mrg 	movl	%ebx, 12(%edi,%eax,8)
    454      1.1  mrg 
    455      1.1  mrg 	movl	%ecx, 16(%edi,%eax,8)
    456      1.1  mrg 	incl	%eax
    457      1.1  mrg 
    458      1.1  mrg 	jnz	L(lshift)
    459      1.1  mrg 
    460      1.1  mrg 
    461      1.1  mrg 	adcl	%eax, %eax		C high bit out
    462      1.1  mrg 	movl	PARAM_SRC, %esi
    463      1.1  mrg 
    464      1.1  mrg 	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
    465      1.1  mrg 	movl	%eax, 12(%edi)		C dst most significant limb
    466      1.1  mrg 
    467      1.1  mrg 
    468      1.1  mrg C -----------------------------------------------------------------------------
    469      1.1  mrg C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
    470      1.1  mrg C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
    471      1.1  mrg C low limb of src[0]^2.
    472      1.1  mrg 
    473      1.1  mrg 	movl	(%esi), %eax		C src[0]
    474      1.1  mrg 	leal	(%esi,%ecx,4), %esi	C src end
    475      1.1  mrg 
    476      1.1  mrg 	negl	%ecx
    477      1.1  mrg 
    478      1.1  mrg 	mull	%eax
    479      1.1  mrg 
    480      1.1  mrg 	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
    481      1.1  mrg 	movl	%edx, %ebx
    482      1.1  mrg 
    483      1.1  mrg 	addl	$1, %ecx		C size-1 and clear carry
    484      1.1  mrg 
    485      1.1  mrg L(diag):
    486      1.1  mrg 	C eax	scratch (low product)
    487      1.1  mrg 	C ebx	carry limb
    488      1.1  mrg 	C ecx	counter, negative
    489      1.1  mrg 	C edx	scratch (high product)
    490      1.1  mrg 	C esi	&src[size]
    491      1.1  mrg 	C edi	&dst[2*size-4]
    492      1.1  mrg 	C ebp	scratch (fetched dst limbs)
    493      1.1  mrg 
    494      1.1  mrg 	movl	(%esi,%ecx,4), %eax
    495      1.1  mrg 	adcl	$0, %ebx
    496      1.1  mrg 
    497      1.1  mrg 	mull	%eax
    498      1.1  mrg 
    499      1.1  mrg 	movl	16-4(%edi,%ecx,8), %ebp
    500      1.1  mrg 
    501      1.1  mrg 	addl	%ebp, %ebx
    502      1.1  mrg 	movl	16(%edi,%ecx,8), %ebp
    503      1.1  mrg 
    504      1.1  mrg 	adcl	%eax, %ebp
    505      1.1  mrg 	movl	%ebx, 16-4(%edi,%ecx,8)
    506      1.1  mrg 
    507      1.1  mrg 	movl	%ebp, 16(%edi,%ecx,8)
    508      1.1  mrg 	incl	%ecx
    509      1.1  mrg 
    510      1.1  mrg 	movl	%edx, %ebx
    511      1.1  mrg 	jnz	L(diag)
    512      1.1  mrg 
    513      1.1  mrg 
    514      1.1  mrg 	adcl	$0, %edx
    515      1.1  mrg 	movl	16-4(%edi), %eax	C dst most significant limb
    516      1.1  mrg 
    517      1.1  mrg 	addl	%eax, %edx
    518      1.1  mrg 	popl	%ebp
    519      1.1  mrg 
    520      1.1  mrg 	movl	%edx, 16-4(%edi)
    521      1.1  mrg 	popl	%esi		C risk of cache bank clash
    522      1.1  mrg 
    523      1.1  mrg 	popl	%edi
    524      1.1  mrg 	popl	%ebx
    525      1.1  mrg 
    526      1.1  mrg 	ret
    527      1.1  mrg 
    528      1.1  mrg EPILOGUE()
    529