Home | History | Annotate | Line # | Download | only in k7
      1      1.1  mrg dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
     35      1.1  mrg C     limbs/loop unrolling, the UNROLL_COUNT chosen below).
     36      1.1  mrg 
     37      1.1  mrg 
     38      1.1  mrg 
     39      1.1  mrg dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
     40      1.1  mrg dnl           8           4.67
     41      1.1  mrg dnl          16           4.59
     42      1.1  mrg dnl          32           4.42
     43      1.1  mrg dnl  Maximum possible with the current code is 32.
     44      1.1  mrg dnl
     45      1.1  mrg dnl  At 32 the typical 13-26 limb sizes from the Karatsuba code will get
     46      1.1  mrg dnl  done with a straight run through a block of code, no inner loop.  Using
     47      1.1  mrg dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
     48      1.1  mrg dnl  UNROLL_COUNT is the number of limbs covered by one pass of the unrolled loop.
     49      1.1  mrg deflit(UNROLL_COUNT, 32)
     50      1.1  mrg 
     51      1.1  mrg 
     52      1.1  mrg C void mpn_mul_basecase (mp_ptr wp,
     53      1.1  mrg C                        mp_srcptr xp, mp_size_t xsize,
     54      1.1  mrg C                        mp_srcptr yp, mp_size_t ysize);
     55      1.1  mrg C
     56      1.1  mrg C Calculate xp,xsize multiplied by yp,ysize, storing the result in
     57      1.1  mrg C wp,xsize+ysize.
     58      1.1  mrg C
     59      1.1  mrg C This routine is essentially the same as mpn/generic/mul_basecase.c, but
     60      1.1  mrg C it's faster because it does most of the mpn_addmul_1() startup
     61      1.1  mrg C calculations only once.  The saving is 15-25% on typical sizes coming from
     62      1.1  mrg C the Karatsuba multiply code.
     63      1.1  mrg 
     64      1.1  mrg ifdef(`PIC',`
     65      1.1  mrg deflit(UNROLL_THRESHOLD, 5)
     66      1.1  mrg ',`
     67      1.1  mrg deflit(UNROLL_THRESHOLD, 5)
     68      1.1  mrg ')	C xsize at or above this takes the unrolled loop, PIC and non-PIC currently agree
     69      1.1  mrg C Stack arguments in prototype order: wp, xp, xsize, yp, ysize.  TODO confirm offset base, defframe is defined elsewhere.
     70      1.1  mrg defframe(PARAM_YSIZE,20)	C mp_size_t ysize
     71      1.1  mrg defframe(PARAM_YP,   16)	C mp_srcptr yp
     72      1.1  mrg defframe(PARAM_XSIZE,12)	C mp_size_t xsize
     73      1.1  mrg defframe(PARAM_XP,   8)	C mp_srcptr xp
     74      1.1  mrg defframe(PARAM_WP,   4)	C mp_ptr wp
     75      1.1  mrg 
     76      1.1  mrg 	TEXT
     77      1.1  mrg 	ALIGN(32)
     78      1.1  mrg PROLOGUE(mpn_mul_basecase)
     79      1.1  mrg deflit(`FRAME',0)
     80      1.1  mrg C Entry: dispatch on xsize.  Below 2 falls through, 2 and above branch away.
     81      1.1  mrg 	movl	PARAM_XSIZE, %ecx
     82      1.1  mrg 	movl	PARAM_YP, %eax
     83      1.1  mrg 
     84      1.1  mrg 	movl	PARAM_XP, %edx
     85      1.1  mrg 	movl	(%eax), %eax	C yp low limb
     86      1.1  mrg 
     87      1.1  mrg 	cmpl	$2, %ecx
     88      1.1  mrg 	ja	L(xsize_more_than_two)	C unsigned compare, xsize above 2
     89      1.1  mrg 	je	L(two_by_something)	C xsize exactly 2
     90      1.1  mrg 
     91      1.1  mrg C NOTE(review): ysize is not examined on this path, presumably callers keep xsize >= ysize >= 1 - confirm.
     92      1.1  mrg 	C one limb by one limb
     93      1.1  mrg 
     94      1.1  mrg 	mull	(%edx)		C edx:eax = yp[0] * xp[0]
     95      1.1  mrg 
     96      1.1  mrg 	movl	PARAM_WP, %ecx
     97      1.1  mrg 	movl	%eax, (%ecx)	C wp[0] = low limb
     98      1.1  mrg 	movl	%edx, 4(%ecx)	C wp[1] = high limb
     99      1.1  mrg 	ret
    100      1.1  mrg 
    101      1.1  mrg 
    102      1.1  mrg C -----------------------------------------------------------------------------
    103      1.1  mrg L(two_by_something):
    104      1.1  mrg deflit(`FRAME',0)
    105      1.1  mrg 	decl	PARAM_YSIZE	C ysize-1 written to the argument slot, zero flag tested by the jnz below
    106      1.1  mrg 	pushl	%ebx		defframe_pushl(`SAVE_EBX')
    107      1.1  mrg 	movl	%eax, %ecx	C yp low limb
    108      1.1  mrg 
    109      1.1  mrg 	movl	PARAM_WP, %ebx
    110      1.1  mrg 	pushl	%esi		defframe_pushl(`SAVE_ESI')
    111      1.1  mrg 	movl	%edx, %esi	C xp
    112      1.1  mrg 
    113      1.1  mrg 	movl	(%edx), %eax	C xp low limb
    114      1.1  mrg 	jnz	L(two_by_two)	C ysize-1 nonzero (pushl and movl leave the decl flags alone)
    115      1.1  mrg 
    116      1.1  mrg 
    117      1.1  mrg 	C two limbs by one limb
    118      1.1  mrg 
    119      1.1  mrg 	mull	%ecx		C xp[0] * yp[0]
    120      1.1  mrg 
    121      1.1  mrg 	movl	%eax, (%ebx)	C wp[0]
    122      1.1  mrg 	movl	4(%esi), %eax	C xp[1]
    123      1.1  mrg 	movl	%edx, %esi	C carry
    124      1.1  mrg 
    125      1.1  mrg 	mull	%ecx		C xp[1] * yp[0]
    126      1.1  mrg 
    127      1.1  mrg 	addl	%eax, %esi	C carry plus low limb
    128      1.1  mrg 
    129      1.1  mrg 	movl	%esi, 4(%ebx)	C wp[1]
    130      1.1  mrg 	movl	SAVE_ESI, %esi
    131      1.1  mrg 
    132      1.1  mrg 	adcl	$0, %edx	C high limb plus the carry flag from the addl
    133      1.1  mrg 
    134      1.1  mrg 	movl	%edx, 8(%ebx)	C wp[2]
    135      1.1  mrg 	movl	SAVE_EBX, %ebx
    136      1.1  mrg 	addl	$FRAME, %esp	C unwind the two pushl above (FRAME as tracked by defframe_pushl)
    137      1.1  mrg 
    138      1.1  mrg 	ret
    139      1.1  mrg 
    140      1.1  mrg 
    141      1.1  mrg 
    142      1.1  mrg C -----------------------------------------------------------------------------
    143      1.1  mrg C Two limbs by two limbs: four products are accumulated into wp[0] to wp[3].
    144      1.1  mrg C Could load yp earlier into another register.
    145      1.1  mrg 	ALIGN(16)
    146      1.1  mrg L(two_by_two):
    147      1.1  mrg 	C eax	xp low limb
    148      1.1  mrg 	C ebx	wp
    149      1.1  mrg 	C ecx	yp low limb
    150      1.1  mrg 	C edx
    151      1.1  mrg 	C esi	xp
    152      1.1  mrg 	C edi
    153      1.1  mrg 	C ebp
    154      1.1  mrg C NOTE(review): reached whenever ysize-1 is nonzero, presumably ysize is 2 here - confirm.
    155      1.1  mrg dnl  FRAME carries on from previous
    156      1.1  mrg 
    157      1.1  mrg 	mull	%ecx		C xp[0] * yp[0]
    158      1.1  mrg 
    159      1.1  mrg 	push	%edi		defframe_pushl(`SAVE_EDI')
    160      1.1  mrg 	movl	%edx, %edi	C carry, for wp[1]
    161      1.1  mrg 
    162      1.1  mrg 	movl	%eax, (%ebx)	C wp[0]
    163      1.1  mrg 	movl	4(%esi), %eax	C xp[1]
    164      1.1  mrg 
    165      1.1  mrg 	mull	%ecx		C xp[1] * yp[0]
    166      1.1  mrg 
    167      1.1  mrg 	addl	%eax, %edi	C wp[1] so far
    168      1.1  mrg 	movl	PARAM_YP, %ecx
    169      1.1  mrg 
    170      1.1  mrg 	adcl	$0, %edx	C high limb plus the carry flag from the addl
    171      1.1  mrg 	movl	4(%ecx), %ecx	C yp[1]
    172      1.1  mrg 	movl	%edi, 4(%ebx)	C wp[1] stored now, xp[0]*yp[1] is added to it below
    173      1.1  mrg 
    174      1.1  mrg 	movl	4(%esi), %eax	C xp[1]
    175      1.1  mrg 	movl	%edx, %edi	C carry, for wp[2]
    176      1.1  mrg 
    177      1.1  mrg 	mull	%ecx		C xp[1] * yp[1]
    178      1.1  mrg 
    179      1.1  mrg 	addl	%eax, %edi	C wp[2] so far
    180      1.1  mrg 
    181      1.1  mrg 	adcl	$0, %edx
    182      1.1  mrg 	movl	(%esi), %eax	C xp[0]
    183      1.1  mrg 
    184      1.1  mrg 	movl	%edx, %esi	C carry, for wp[3]
    185      1.1  mrg 
    186      1.1  mrg 	mull	%ecx		C xp[0] * yp[1]
    187      1.1  mrg 
    188      1.1  mrg 	addl	%eax, 4(%ebx)	C low limb into wp[1]
    189      1.1  mrg 	adcl	%edx, %edi	C high limb and carry into wp[2]
    190      1.1  mrg 	movl	%edi, 8(%ebx)
    191      1.1  mrg 
    192      1.1  mrg 	adcl	$0, %esi	C carry on into wp[3] (the movl in between keeps the flags)
    193      1.1  mrg 	movl	SAVE_EDI, %edi
    194      1.1  mrg 	movl	%esi, 12(%ebx)
    195      1.1  mrg 
    196      1.1  mrg 	movl	SAVE_ESI, %esi
    197      1.1  mrg 	movl	SAVE_EBX, %ebx
    198      1.1  mrg 	addl	$FRAME, %esp
    199      1.1  mrg 
    200      1.1  mrg 	ret
    201      1.1  mrg 
    202      1.1  mrg 
    203      1.1  mrg C -----------------------------------------------------------------------------
    204      1.1  mrg 	ALIGN(16)
    205      1.1  mrg L(xsize_more_than_two):
    206      1.1  mrg 
    207      1.1  mrg C The first limb of yp is processed with a simple mpn_mul_1 style loop
    208      1.1  mrg C inline.  Unrolling this doesn't seem worthwhile since it's only run once
    209      1.1  mrg C (whereas the addmul below is run ysize-1 many times).  A call to the
    210      1.1  mrg C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
    211      1.1  mrg C popping, and doesn't seem likely to be worthwhile on the typical 13-26
    212      1.1  mrg C limb operations the Karatsuba code calls here with.
    213      1.1  mrg 
    214      1.1  mrg 	C eax	yp[0]
    215      1.1  mrg 	C ebx
    216      1.1  mrg 	C ecx	xsize
    217      1.1  mrg 	C edx	xp
    218      1.1  mrg 	C esi
    219      1.1  mrg 	C edi
    220      1.1  mrg 	C ebp
    221      1.1  mrg 
    222      1.1  mrg dnl  FRAME doesn't carry on from previous, no pushes yet here
    223      1.1  mrg defframe(`SAVE_EBX',-4)
    224      1.1  mrg defframe(`SAVE_ESI',-8)
    225      1.1  mrg defframe(`SAVE_EDI',-12)
    226      1.1  mrg defframe(`SAVE_EBP',-16)
    227      1.1  mrg deflit(`FRAME',0)
    228      1.1  mrg 
    229      1.1  mrg 	subl	$16, %esp	C room for the four SAVE_ slots
    230      1.1  mrg deflit(`FRAME',16)
    231      1.1  mrg 
    232      1.1  mrg 	movl	%edi, SAVE_EDI
    233      1.1  mrg 	movl	PARAM_WP, %edi
    234      1.1  mrg 
    235      1.1  mrg 	movl	%ebx, SAVE_EBX
    236      1.1  mrg 	movl	%ebp, SAVE_EBP
    237      1.1  mrg 	movl	%eax, %ebp	C yp[0], the multiplier for mul1
    238      1.1  mrg 
    239      1.1  mrg 	movl	%esi, SAVE_ESI
    240      1.1  mrg 	xorl	%ebx, %ebx	C initial carry
    241      1.1  mrg 	leal	(%edx,%ecx,4), %esi	C xp end
    242      1.1  mrg 
    243      1.1  mrg 	leal	(%edi,%ecx,4), %edi	C wp end of mul1
    244      1.1  mrg 	negl	%ecx		C -xsize, counted up to zero
    245      1.1  mrg 
    246      1.1  mrg 
    247      1.1  mrg L(mul1):
    248      1.1  mrg 	C eax	scratch
    249      1.1  mrg 	C ebx	carry
    250      1.1  mrg 	C ecx	counter, negative
    251      1.1  mrg 	C edx	scratch
    252      1.1  mrg 	C esi	xp end
    253      1.1  mrg 	C edi	wp end of mul1
    254      1.1  mrg 	C ebp	multiplier
    255      1.1  mrg 
    256      1.1  mrg 	movl	(%esi,%ecx,4), %eax	C next xp limb
    257      1.1  mrg 
    258      1.1  mrg 	mull	%ebp
    259      1.1  mrg 
    260      1.1  mrg 	addl	%ebx, %eax	C low limb plus carry in
    261      1.1  mrg 	movl	%eax, (%edi,%ecx,4)
    262      1.1  mrg 	movl	$0, %ebx	C zero without touching the carry flag from the addl
    263      1.1  mrg 
    264      1.1  mrg 	adcl	%edx, %ebx	C carry out = high limb plus carry flag
    265      1.1  mrg 	incl	%ecx
    266      1.1  mrg 	jnz	L(mul1)
    267      1.1  mrg 
    268      1.1  mrg 
    269      1.1  mrg 	movl	PARAM_YSIZE, %edx
    270      1.1  mrg 	movl	PARAM_XSIZE, %ecx
    271      1.1  mrg 
    272      1.1  mrg 	movl	%ebx, (%edi)		C final carry
    273      1.1  mrg 	decl	%edx		C ysize-1, zero when yp has a single limb
    274      1.1  mrg 
    275      1.1  mrg 	jnz	L(ysize_more_than_one)
    276      1.1  mrg 
    277      1.1  mrg C ysize is 1, the mul1 pass above was the whole product.
    278      1.1  mrg 	movl	SAVE_EDI, %edi
    279      1.1  mrg 	movl	SAVE_EBX, %ebx
    280      1.1  mrg 
    281      1.1  mrg 	movl	SAVE_EBP, %ebp
    282      1.1  mrg 	movl	SAVE_ESI, %esi
    283      1.1  mrg 	addl	$FRAME, %esp
    284      1.1  mrg 
    285      1.1  mrg 	ret
    286      1.1  mrg 
    287      1.1  mrg 
    288      1.1  mrg L(ysize_more_than_one):
    289      1.1  mrg 	cmpl	$UNROLL_THRESHOLD, %ecx	C xsize against the unrolling threshold
    290      1.1  mrg 	movl	PARAM_YP, %eax
    291      1.1  mrg 
    292      1.1  mrg 	jae	L(unroll)
    293      1.1  mrg 
    294      1.1  mrg 
    295      1.1  mrg C -----------------------------------------------------------------------------
    296      1.1  mrg 	C simple addmul looping
    297      1.1  mrg 	C
    298      1.1  mrg 	C eax	yp
    299      1.1  mrg 	C ebx
    300      1.1  mrg 	C ecx	xsize
    301      1.1  mrg 	C edx	ysize-1
    302      1.1  mrg 	C esi	xp end
    303      1.1  mrg 	C edi	wp end of mul1
    304      1.1  mrg 	C ebp
    305      1.1  mrg 
    306      1.1  mrg 	leal	4(%eax,%edx,4), %ebp	C yp end
    307      1.1  mrg 	negl	%ecx		C -xsize
    308      1.1  mrg 	negl	%edx		C -(ysize-1)
    309      1.1  mrg 
    310      1.1  mrg 	movl	(%esi,%ecx,4), %eax	C xp low limb
    311      1.1  mrg 	movl	%edx, PARAM_YSIZE	C -(ysize-1)
    312      1.1  mrg 	incl	%ecx
    313      1.1  mrg 
    314      1.1  mrg 	xorl	%ebx, %ebx		C initial carry
    315      1.1  mrg 	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
    316      1.1  mrg 	movl	%ebp, PARAM_YP
    317      1.1  mrg 
    318      1.1  mrg 	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
    319      1.1  mrg 	jmp	L(simple_outer_entry)
    320      1.1  mrg 
    321      1.1  mrg 
    322      1.1  mrg 	C this is offset 0x121 so close enough to aligned
    323      1.1  mrg L(simple_outer_top):
    324      1.1  mrg 	C ebp	ysize counter, negative
    325      1.1  mrg 
    326      1.1  mrg 	movl	PARAM_YP, %edx
    327      1.1  mrg 	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
    328      1.1  mrg 	xorl	%ebx, %ebx		C carry
    329      1.1  mrg 
    330      1.1  mrg 	movl	%ebp, PARAM_YSIZE
    331      1.1  mrg 	addl	$4, %edi		C next position in wp
    332      1.1  mrg 
    333      1.1  mrg 	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
    334      1.1  mrg 	movl	-4(%esi,%ecx,4), %eax	C xp low limb
    335      1.1  mrg 
    336      1.1  mrg 
    337      1.1  mrg L(simple_outer_entry):
    338      1.1  mrg 
    339      1.1  mrg L(simple_inner):
    340      1.1  mrg 	C eax	xp limb
    341      1.1  mrg 	C ebx	carry limb
    342      1.1  mrg 	C ecx	loop counter (negative)
    343      1.1  mrg 	C edx	scratch
    344      1.1  mrg 	C esi	xp end
    345      1.1  mrg 	C edi	wp end
    346      1.1  mrg 	C ebp	multiplier
    347      1.1  mrg 
    348      1.1  mrg 	mull	%ebp
    349      1.1  mrg 
    350      1.1  mrg 	addl	%eax, %ebx	C carry plus low limb
    351      1.1  mrg 	adcl	$0, %edx
    352      1.1  mrg 
    353      1.1  mrg 	addl	%ebx, (%edi,%ecx,4)	C accumulate into wp
    354      1.1  mrg 	movl	(%esi,%ecx,4), %eax	C next xp limb
    355      1.1  mrg 	adcl	$0, %edx
    356      1.1  mrg 
    357      1.1  mrg 	incl	%ecx
    358      1.1  mrg 	movl	%edx, %ebx	C high limb becomes the next carry
    359      1.1  mrg 	jnz	L(simple_inner)
    360      1.1  mrg 
    361      1.1  mrg C eax holds the last xp limb, loaded by the final pass of the loop above.
    362      1.1  mrg 	mull	%ebp
    363      1.1  mrg 
    364      1.1  mrg 	movl	PARAM_YSIZE, %ebp
    365      1.1  mrg 	addl	%eax, %ebx
    366      1.1  mrg 
    367      1.1  mrg 	adcl	$0, %edx
    368      1.1  mrg 	addl	%ebx, (%edi)
    369      1.1  mrg 
    370      1.1  mrg 	adcl	$0, %edx
    371      1.1  mrg 	incl	%ebp		C step the ysize counter, zero after the last yp limb
    372      1.1  mrg 
    373      1.1  mrg 	movl	%edx, 4(%edi)	C new top limb (movl keeps the incl flags for the jnz)
    374      1.1  mrg 	jnz	L(simple_outer_top)
    375      1.1  mrg 
    376      1.1  mrg 
    377      1.1  mrg 	movl	SAVE_EBX, %ebx
    378      1.1  mrg 	movl	SAVE_ESI, %esi
    379      1.1  mrg 
    380      1.1  mrg 	movl	SAVE_EDI, %edi
    381      1.1  mrg 	movl	SAVE_EBP, %ebp
    382      1.1  mrg 	addl	$FRAME, %esp
    383      1.1  mrg 
    384      1.1  mrg 	ret
    385      1.1  mrg 
    386      1.1  mrg 
    387      1.1  mrg 
    388      1.1  mrg C -----------------------------------------------------------------------------
    389      1.1  mrg C
    390      1.1  mrg C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
    391      1.1  mrg C comments.
    392      1.1  mrg C
    393      1.1  mrg C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
    394      1.1  mrg C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
    395      1.1  mrg C to give an initial VAR_COUNTER at the top of the outer loop.
    396      1.1  mrg C
    397      1.1  mrg C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
    398      1.1  mrg C up to -1, inclusive.
    399      1.1  mrg C
    400      1.1  mrg C VAR_JMP is the computed jump into the unrolled loop.
    401      1.1  mrg C
    402      1.1  mrg C VAR_XP_LOW is the least significant limb of xp, which is needed at the
    403      1.1  mrg C start of the unrolled loop.
    404      1.1  mrg C
    405      1.1  mrg C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
    406      1.1  mrg C inclusive.
    407      1.1  mrg C
    408      1.1  mrg C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
    409      1.1  mrg C added to give the location of the next limb of yp, which is the multiplier
    410      1.1  mrg C in the unrolled loop.
    411      1.1  mrg C
    412      1.1  mrg C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
    413      1.1  mrg C outer loop to take care of xp, wp and the inner loop counter.
    414      1.1  mrg 
    415      1.1  mrg defframe(VAR_COUNTER,  -20)
    416      1.1  mrg defframe(VAR_ADJUST,   -24)
    417      1.1  mrg defframe(VAR_JMP,      -28)
    418      1.1  mrg defframe(VAR_XP_LOW,   -32)
    419      1.1  mrg deflit(VAR_EXTRA_SPACE, 16)	C bytes for the four VAR_ slots above, below the SAVE_ slots
    420      1.1  mrg 
    421      1.1  mrg 
    422      1.1  mrg L(unroll):
    423      1.1  mrg 	C eax	yp
    424      1.1  mrg 	C ebx
    425      1.1  mrg 	C ecx	xsize
    426      1.1  mrg 	C edx	ysize-1
    427      1.1  mrg 	C esi	xp end
    428      1.1  mrg 	C edi	wp end of mul1
    429      1.1  mrg 	C ebp
    430      1.1  mrg 
    431      1.1  mrg 	movl	PARAM_XP, %esi		C xp start, replacing xp end
    432      1.1  mrg 	movl	4(%eax), %ebp		C multiplier (yp second limb)
    433      1.1  mrg 	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
    434      1.1  mrg 
    435      1.1  mrg 	movl	PARAM_WP, %edi		C wp start, replacing wp end of mul1
    436      1.1  mrg 	movl	%eax, PARAM_YP
    437      1.1  mrg 	negl	%edx
    438      1.1  mrg 
    439      1.1  mrg 	movl	%edx, PARAM_YSIZE	C -(ysize-1), the outer loop counter
    440      1.1  mrg 	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
    441      1.1  mrg 	decl	%ecx				C xsize-1
    442      1.1  mrg 
    443      1.1  mrg 	movl	(%esi), %eax		C xp low limb
    444      1.1  mrg 	andl	$-UNROLL_MASK-1, %ebx	C presumably rounds down to a multiple of UNROLL_COUNT (UNROLL_MASK is defined elsewhere)
    445      1.1  mrg 	negl	%ecx
    446      1.1  mrg 
    447      1.1  mrg 	subl	$VAR_EXTRA_SPACE, %esp
    448      1.1  mrg deflit(`FRAME',16+VAR_EXTRA_SPACE)
    449      1.1  mrg 	negl	%ebx
    450      1.1  mrg 	andl	$UNROLL_MASK, %ecx	C count of leading unrolled limbs to skip on entry
    451      1.1  mrg 
    452      1.1  mrg 	movl	%ebx, VAR_ADJUST
    453      1.1  mrg 	movl	%ecx, %edx
    454      1.1  mrg 	shll	$4, %ecx		C 16*n, adding edx below makes it 17*n
    455      1.1  mrg 
    456      1.1  mrg 	sarl	$UNROLL_LOG2, %ebx	C initial VAR_COUNTER, stored once the outer entry is reached
    457      1.1  mrg 
    458      1.1  mrg 	C 17 code bytes per limb
    459      1.1  mrg ifdef(`PIC',`
    460      1.1  mrg 	call	L(pic_calc)
    461      1.1  mrg L(unroll_here):
    462      1.1  mrg ',`
    463      1.1  mrg 	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
    464      1.1  mrg ')
    465      1.1  mrg 	negl	%edx
    466      1.1  mrg 
    467      1.1  mrg 	movl	%eax, VAR_XP_LOW
    468      1.1  mrg 	movl	%ecx, VAR_JMP		C computed entry point into the unrolled code
    469      1.1  mrg 	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
    470      1.1  mrg 	leal	4(%esi,%edx,4), %esi	C  and start at second limb
    471      1.1  mrg 	jmp	L(unroll_outer_entry)
    472      1.1  mrg 
    473      1.1  mrg 
    474      1.1  mrg ifdef(`PIC',`
    475      1.1  mrg L(pic_calc):
    476      1.1  mrg 	C See mpn/x86/README about old gas bugs
    477      1.1  mrg 	leal	(%ecx,%edx,1), %ecx
    478      1.1  mrg 	addl	$L(unroll_entry)-L(unroll_here), %ecx
    479      1.1  mrg 	addl	(%esp), %ecx	C plus the return address left on the stack by the call
    480      1.1  mrg 	ret_internal
    481      1.1  mrg ')
    482      1.1  mrg 
    483      1.1  mrg 
    484      1.1  mrg C --------------------------------------------------------------------------
    485      1.1  mrg 	ALIGN(32)
    486      1.1  mrg L(unroll_outer_top):
    487      1.1  mrg 	C ebp	ysize counter, negative
    488      1.1  mrg 
    489      1.1  mrg 	movl	VAR_ADJUST, %ebx
    490      1.1  mrg 	movl	PARAM_YP, %edx
    491      1.1  mrg 
    492      1.1  mrg 	movl	VAR_XP_LOW, %eax
    493      1.1  mrg 	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
    494      1.1  mrg 
    495      1.1  mrg 	leal	4(%edi,%ebx,4), %edi	C wp back by the inner loop advance, then on one limb
    496      1.1  mrg 	leal	(%esi,%ebx,4), %esi	C xp back by the inner loop advance
    497      1.1  mrg 	sarl	$UNROLL_LOG2, %ebx	C initial VAR_COUNTER
    498      1.1  mrg 
    499      1.1  mrg 	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
    500      1.1  mrg 	movl	VAR_JMP, %ecx
    501      1.1  mrg 
    502      1.1  mrg L(unroll_outer_entry):
    503      1.1  mrg 	mull	%ebp		C xp low limb times multiplier
    504      1.1  mrg C Bit 0 of the entry address in ecx picks which carry register receives the low product limb.
    505      1.1  mrg 	testb	$1, %cl		C and clear carry bit
    506      1.1  mrg 	movl	%ebx, VAR_COUNTER
    507      1.1  mrg 	movl	$0, %ebx	C movl leaves the testb flags in place for the cmovs
    508      1.1  mrg 
    509      1.1  mrg 	movl	$0, %ecx
    510      1.1  mrg 	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
    511      1.1  mrg 	cmovnz(	%eax, %ebx)
    512      1.1  mrg 
    513      1.1  mrg 	C Extra fetch of VAR_JMP is bad, but registers are tight
    514      1.1  mrg 	jmp	*VAR_JMP
    515      1.1  mrg 
    516      1.1  mrg 
    517      1.1  mrg C -----------------------------------------------------------------------------
    518      1.1  mrg 	ALIGN(32)
    519      1.1  mrg L(unroll_top):
    520      1.1  mrg 	C eax	xp limb
    521      1.1  mrg 	C ebx	carry high
    522      1.1  mrg 	C ecx	carry low
    523      1.1  mrg 	C edx	scratch
    524      1.1  mrg 	C esi	xp+8
    525      1.1  mrg 	C edi	wp
    526      1.1  mrg 	C ebp	yp multiplier limb
    527      1.1  mrg 	C
    528      1.1  mrg 	C VAR_COUNTER  loop counter, negative
    529      1.1  mrg 	C
    530      1.1  mrg 	C 17 bytes each limb
    531      1.1  mrg 
    532      1.1  mrg L(unroll_entry):
    533      1.1  mrg 
    534      1.1  mrg deflit(CHUNK_COUNT,2)	C limbs handled by each forloop iteration below
    535      1.1  mrg forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
    536      1.1  mrg 	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
    537      1.1  mrg 	deflit(`disp1', eval(disp0 + 4))
    538      1.1  mrg 
    539      1.1  mrg Zdisp(	movl,	disp0,(%esi), %eax)
    540      1.1  mrg 	adcl	%edx, %ebx
    541      1.1  mrg 
    542      1.1  mrg 	mull	%ebp
    543      1.1  mrg 
    544      1.1  mrg Zdisp(	addl,	%ecx, disp0,(%edi))
    545      1.1  mrg 	movl	$0, %ecx
    546      1.1  mrg 
    547      1.1  mrg 	adcl	%eax, %ebx
    548      1.1  mrg 
    549      1.1  mrg 
    550      1.1  mrg 	movl	disp1(%esi), %eax
    551      1.1  mrg 	adcl	%edx, %ecx
    552      1.1  mrg 
    553      1.1  mrg 	mull	%ebp
    554      1.1  mrg 
    555      1.1  mrg 	addl	%ebx, disp1(%edi)
    556      1.1  mrg 	movl	$0, %ebx
    557      1.1  mrg 
    558      1.1  mrg 	adcl	%eax, %ecx
    559      1.1  mrg ')
    560      1.1  mrg 
    561      1.1  mrg C incl and leal do not change the carry flag, which is still pending from the last adcl.
    562      1.1  mrg 	incl	VAR_COUNTER	C sets the zero flag tested by the jnz below
    563      1.1  mrg 	leal	UNROLL_BYTES(%esi), %esi
    564      1.1  mrg 	leal	UNROLL_BYTES(%edi), %edi
    565      1.1  mrg 
    566      1.1  mrg 	jnz	L(unroll_top)
    567      1.1  mrg 
    568      1.1  mrg 
    569      1.1  mrg 	C eax
    570      1.1  mrg 	C ebx	zero
    571      1.1  mrg 	C ecx	low
    572      1.1  mrg 	C edx	high
    573      1.1  mrg 	C esi
    574      1.1  mrg 	C edi	wp, pointing at second last limb
    575      1.1  mrg 	C ebp
    576      1.1  mrg 	C
    577      1.1  mrg 	C carry flag to be added to high
    578      1.1  mrg 
    579      1.1  mrg deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
    580      1.1  mrg deflit(`disp1', eval(disp0-0 + 4))
    581      1.1  mrg 
    582      1.1  mrg 	movl	PARAM_YSIZE, %ebp
    583      1.1  mrg 	adcl	$0, %edx	C pending carry flag into high
    584      1.1  mrg 	addl	%ecx, disp0(%edi)	C low into the second last limb
    585      1.1  mrg 
    586      1.1  mrg 	adcl	$0, %edx
    587      1.1  mrg 	incl	%ebp		C step the ysize counter, zero after the last yp limb
    588      1.1  mrg 
    589      1.1  mrg 	movl	%edx, disp1(%edi)	C high into the last limb (movl keeps the incl flags for the jnz)
    590      1.1  mrg 	jnz	L(unroll_outer_top)
    591      1.1  mrg 
    592      1.1  mrg 
    593      1.1  mrg 	movl	SAVE_ESI, %esi
    594      1.1  mrg 	movl	SAVE_EBP, %ebp
    595      1.1  mrg 
    596      1.1  mrg 	movl	SAVE_EDI, %edi
    597      1.1  mrg 	movl	SAVE_EBX, %ebx
    598      1.1  mrg 	addl	$FRAME, %esp
    599      1.1  mrg 
    600      1.1  mrg 	ret
    601      1.1  mrg 
    602      1.1  mrg EPILOGUE()
    603