Home | History | Annotate | Line # | Download | only in k6
      1      1.1  mrg dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
      2      1.1  mrg 
      3  1.1.1.3  mrg dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
      4  1.1.1.3  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.3  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21      1.1  mrg dnl
     22  1.1.1.3  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.3  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34  1.1.1.2  mrg C			    cycles/limb
     35  1.1.1.2  mrg C P5
     36  1.1.1.2  mrg C P6 model 0-8,10-12		 5.94
     37  1.1.1.2  mrg C P6 model 9  (Banias)		 5.51
     38  1.1.1.2  mrg C P6 model 13 (Dothan)		 5.57
     39      1.1  mrg C P4 model 0  (Willamette)
     40      1.1  mrg C P4 model 1  (?)
     41      1.1  mrg C P4 model 2  (Northwood)
     42      1.1  mrg C P4 model 3  (Prescott)
     43      1.1  mrg C P4 model 4  (Nocona)
     44  1.1.1.2  mrg C AMD K6			7.65-8.5 (data dependent)
     45  1.1.1.2  mrg C AMD K7
     46  1.1.1.2  mrg C AMD K8
     47      1.1  mrg 
     48      1.1  mrg 
     49      1.1  mrg dnl  K6:           large multipliers  small multipliers
     50      1.1  mrg dnl  UNROLL_COUNT    cycles/limb       cycles/limb
     51      1.1  mrg dnl        4             9.5              7.78
     52      1.1  mrg dnl        8             9.0              7.78
     53      1.1  mrg dnl       16             8.4              7.65
     54      1.1  mrg dnl       32             8.4              8.2
     55      1.1  mrg dnl
     56      1.1  mrg dnl  Maximum possible unrolling with the current code is 32.
     57      1.1  mrg dnl
     58      1.1  mrg dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
     59      1.1  mrg dnl  byte block, which might explain the good speed at that unrolling.
     60      1.1  mrg 
     61      1.1  mrg deflit(UNROLL_COUNT, 16)
     62      1.1  mrg 
     63      1.1  mrg 
     64      1.1  mrg ifdef(`OPERATION_addmul_1', `
     65      1.1  mrg 	define(M4_inst,        addl)
     66      1.1  mrg 	define(M4_function_1,  mpn_addmul_1)
     67      1.1  mrg 	define(M4_function_1c, mpn_addmul_1c)
     68      1.1  mrg ',`ifdef(`OPERATION_submul_1', `
     69      1.1  mrg 	define(M4_inst,        subl)
     70      1.1  mrg 	define(M4_function_1,  mpn_submul_1)
     71      1.1  mrg 	define(M4_function_1c, mpn_submul_1c)
     72      1.1  mrg ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
     73      1.1  mrg ')')')
     74      1.1  mrg 
     75      1.1  mrg MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
     76      1.1  mrg 
     77      1.1  mrg 
     78      1.1  mrg C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     79      1.1  mrg C                         mp_limb_t mult);
     80      1.1  mrg C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     81      1.1  mrg C                          mp_limb_t mult, mp_limb_t carry);
     82      1.1  mrg C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     83      1.1  mrg C                         mp_limb_t mult);
     84      1.1  mrg C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     85      1.1  mrg C                          mp_limb_t mult, mp_limb_t carry);
     86      1.1  mrg C
     87      1.1  mrg C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
     88      1.1  mrg C multipliers (most significant few bits clear) result in few carry bits and
     89      1.1  mrg C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
     90      1.1  mrg C significant few bits set) make the carry bits 50/50 and lead to something
     91      1.1  mrg C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
     92      1.1  mrg C
     93      1.1  mrg C It's important that the gains for jadcl0 on small multipliers don't come
     94      1.1  mrg C at the cost of slowing down other data.  Tests on uniformly distributed
     95      1.1  mrg C random data, designed to confound branch prediction, show about a 7%
     96      1.1  mrg C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
     97      1.1  mrg C overheads included).
     98      1.1  mrg C
     99      1.1  mrg C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
    100      1.1  mrg C 11.0 cycles/limb), and hence isn't used.
    101      1.1  mrg C
    102      1.1  mrg C In the simple loop, note that running ecx from negative to zero and using
    103      1.1  mrg C it as an index in the two movs wouldn't help.  It would save one
    104      1.1  mrg C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
    105      1.1  mrg C that would be collapsed by this.
    106      1.1  mrg C
    107      1.1  mrg C Attempts at a simpler main loop, with less unrolling, haven't yielded much
    108      1.1  mrg C success, generally running over 9 c/l.
    109      1.1  mrg C
    110      1.1  mrg C
    111      1.1  mrg C jadcl0
    112      1.1  mrg C ------
    113      1.1  mrg C
    114      1.1  mrg C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
    115      1.1  mrg C firstly the instruction decoding and secondly the fact that there's a
    116      1.1  mrg C carry bit for the jadcl0 only on average about 1/4 of the time.
    117      1.1  mrg C
    118      1.1  mrg C The code in the unrolled loop decodes something like the following.
    119      1.1  mrg C
    120      1.1  mrg C                                         decode cycles
    121      1.1  mrg C		mull	%ebp                    2
    122      1.1  mrg C		M4_inst	%esi, disp(%edi)        1
    123      1.1  mrg C		adcl	%eax, %ecx              2
    124      1.1  mrg C		movl	%edx, %esi            \ 1
    125      1.1  mrg C		jnc	1f                    /
    126      1.1  mrg C		incl	%esi                  \ 1
    127      1.1  mrg C	1:	movl	disp(%ebx), %eax      /
    128      1.1  mrg C                                              ---
    129      1.1  mrg C                                               7
    130      1.1  mrg C
    131      1.1  mrg C In a back-to-back style test this measures 7 with the jnc not taken, or 8
    132      1.1  mrg C with it taken (both when correctly predicted).  This is opposite to the
    133      1.1  mrg C measurements showing small multipliers running faster than large ones.
    134      1.1  mrg C Don't really know why.
    135      1.1  mrg C
    136      1.1  mrg C It's not clear how much branch misprediction might be costing.  The K6
    137      1.1  mrg C doco says it will be 1 to 4 cycles, but presumably it's near the low end
    138      1.1  mrg C of that range to get the measured results.
    139      1.1  mrg C
    140      1.1  mrg C
    141      1.1  mrg C In the code the two carries are more or less the preceding mul product and
    142      1.1  mrg C the calculation is roughly
    143      1.1  mrg C
    144      1.1  mrg C	x*y + u*b+v
    145      1.1  mrg C
    146      1.1  mrg C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
    147      1.1  mrg C v are the two limbs it's added to (being the low of the next mul, and a
    148      1.1  mrg C limb from the destination).
    149      1.1  mrg C
    150      1.1  mrg C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
    151      1.1  mrg C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
    152      1.1  mrg C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
    153      1.1  mrg C and b-1, then the total probability can be summed over x and y,
    154      1.1  mrg C
    155      1.1  mrg C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
    156      1.1  mrg C	--- * sum sum --- = --- * ------- * ------- = 1/4
    157      1.1  mrg C       b^2   x=0 y=1 b^2   b^4      2         2
    158      1.1  mrg C
    159      1.1  mrg C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
    160      1.1  mrg C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
    161      1.1  mrg 
    162      1.1  mrg 
    163      1.1  mrg ifdef(`PIC',`
    164      1.1  mrg deflit(UNROLL_THRESHOLD, 9)
    165      1.1  mrg ',`
    166      1.1  mrg deflit(UNROLL_THRESHOLD, 6)
    167      1.1  mrg ')
    168      1.1  mrg 
    169      1.1  mrg defframe(PARAM_CARRY,     20)
    170      1.1  mrg defframe(PARAM_MULTIPLIER,16)
    171      1.1  mrg defframe(PARAM_SIZE,      12)
    172      1.1  mrg defframe(PARAM_SRC,       8)
    173      1.1  mrg defframe(PARAM_DST,       4)
    174      1.1  mrg 
    175      1.1  mrg 	TEXT
    176      1.1  mrg 	ALIGN(32)
    177      1.1  mrg 
    178      1.1  mrg PROLOGUE(M4_function_1c)
    179      1.1  mrg 	pushl	%esi
    180      1.1  mrg deflit(`FRAME',4)
    181      1.1  mrg 	movl	PARAM_CARRY, %esi
    182      1.1  mrg 	jmp	L(start_nc)
    183      1.1  mrg EPILOGUE()
    184      1.1  mrg 
    185      1.1  mrg PROLOGUE(M4_function_1)
    186      1.1  mrg 	push	%esi
    187      1.1  mrg deflit(`FRAME',4)
    188      1.1  mrg 	xorl	%esi, %esi	C initial carry
    189      1.1  mrg 
    190      1.1  mrg L(start_nc):
    191      1.1  mrg 	movl	PARAM_SIZE, %ecx
    192      1.1  mrg 	pushl	%ebx
    193      1.1  mrg deflit(`FRAME',8)
    194      1.1  mrg 
    195      1.1  mrg 	movl	PARAM_SRC, %ebx
    196      1.1  mrg 	pushl	%edi
    197      1.1  mrg deflit(`FRAME',12)
    198      1.1  mrg 
    199      1.1  mrg 	cmpl	$UNROLL_THRESHOLD, %ecx
    200      1.1  mrg 	movl	PARAM_DST, %edi
    201      1.1  mrg 
    202      1.1  mrg 	pushl	%ebp
    203      1.1  mrg deflit(`FRAME',16)
    204      1.1  mrg 	jae	L(unroll)
    205      1.1  mrg 
    206      1.1  mrg 
    207      1.1  mrg 	C simple loop
    208      1.1  mrg 
    209      1.1  mrg 	movl	PARAM_MULTIPLIER, %ebp
    210      1.1  mrg 
    211      1.1  mrg L(simple):
    212      1.1  mrg 	C eax	scratch
    213      1.1  mrg 	C ebx	src
    214      1.1  mrg 	C ecx	counter
    215      1.1  mrg 	C edx	scratch
    216      1.1  mrg 	C esi	carry
    217      1.1  mrg 	C edi	dst
    218      1.1  mrg 	C ebp	multiplier
    219      1.1  mrg 
    220      1.1  mrg 	movl	(%ebx), %eax
    221      1.1  mrg 	addl	$4, %ebx
    222      1.1  mrg 
    223      1.1  mrg 	mull	%ebp
    224      1.1  mrg 
    225      1.1  mrg 	addl	$4, %edi
    226      1.1  mrg 	addl	%esi, %eax
    227      1.1  mrg 
    228      1.1  mrg 	adcl	$0, %edx
    229      1.1  mrg 
    230      1.1  mrg 	M4_inst	%eax, -4(%edi)
    231      1.1  mrg 
    232      1.1  mrg 	adcl	$0, %edx
    233      1.1  mrg 
    234      1.1  mrg 	movl	%edx, %esi
    235      1.1  mrg 	loop	L(simple)
    236      1.1  mrg 
    237      1.1  mrg 
    238      1.1  mrg 	popl	%ebp
    239      1.1  mrg 	popl	%edi
    240      1.1  mrg 
    241      1.1  mrg 	popl	%ebx
    242      1.1  mrg 	movl	%esi, %eax
    243      1.1  mrg 
    244      1.1  mrg 	popl	%esi
    245      1.1  mrg 	ret
    246      1.1  mrg 
    247      1.1  mrg 
    248      1.1  mrg 
    249      1.1  mrg C -----------------------------------------------------------------------------
    250      1.1  mrg C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
    251      1.1  mrg C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
    252      1.1  mrg C For the computed jump an odd size means they start one way around, an even
    253      1.1  mrg C size the other.
    254      1.1  mrg C
    255      1.1  mrg C VAR_JUMP holds the computed jump temporarily because there's not enough
    256      1.1  mrg C registers at the point of doing the mul for the initial two carry limbs.
    257      1.1  mrg C
    258      1.1  mrg C The add/adc for the initial carry in %esi is necessary only for the
    259      1.1  mrg C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
    260      1.1  mrg C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
    261      1.1  mrg C idea.
    262      1.1  mrg 
    263      1.1  mrg dnl  overlapping with parameters already fetched
    264      1.1  mrg define(VAR_COUNTER, `PARAM_SIZE')
    265      1.1  mrg define(VAR_JUMP,    `PARAM_DST')
    266      1.1  mrg 
    267      1.1  mrg L(unroll):
    268      1.1  mrg 	C eax
    269      1.1  mrg 	C ebx	src
    270      1.1  mrg 	C ecx	size
    271      1.1  mrg 	C edx
    272      1.1  mrg 	C esi	initial carry
    273      1.1  mrg 	C edi	dst
    274      1.1  mrg 	C ebp
    275      1.1  mrg 
    276      1.1  mrg 	movl	%ecx, %edx
    277      1.1  mrg 	decl	%ecx
    278      1.1  mrg 
    279      1.1  mrg 	subl	$2, %edx
    280      1.1  mrg 	negl	%ecx
    281      1.1  mrg 
    282      1.1  mrg 	shrl	$UNROLL_LOG2, %edx
    283      1.1  mrg 	andl	$UNROLL_MASK, %ecx
    284      1.1  mrg 
    285      1.1  mrg 	movl	%edx, VAR_COUNTER
    286      1.1  mrg 	movl	%ecx, %edx
    287      1.1  mrg 
    288      1.1  mrg 	shll	$4, %edx
    289      1.1  mrg 	negl	%ecx
    290      1.1  mrg 
    291      1.1  mrg 	C 15 code bytes per limb
    292      1.1  mrg ifdef(`PIC',`
    293      1.1  mrg 	call	L(pic_calc)
    294      1.1  mrg L(here):
    295      1.1  mrg ',`
    296      1.1  mrg 	leal	L(entry) (%edx,%ecx,1), %edx
    297      1.1  mrg ')
    298      1.1  mrg 	movl	(%ebx), %eax		C src low limb
    299      1.1  mrg 
    300      1.1  mrg 	movl	PARAM_MULTIPLIER, %ebp
    301      1.1  mrg 	movl	%edx, VAR_JUMP
    302      1.1  mrg 
    303      1.1  mrg 	mull	%ebp
    304      1.1  mrg 
    305      1.1  mrg 	addl	%esi, %eax	C initial carry (from _1c)
    306      1.1  mrg 	jadcl0(	%edx)
    307      1.1  mrg 
    308      1.1  mrg 
    309      1.1  mrg 	leal	4(%ebx,%ecx,4), %ebx
    310      1.1  mrg 	movl	%edx, %esi	C high carry
    311      1.1  mrg 
    312      1.1  mrg 	movl	VAR_JUMP, %edx
    313      1.1  mrg 	leal	(%edi,%ecx,4), %edi
    314      1.1  mrg 
    315      1.1  mrg 	testl	$1, %ecx
    316      1.1  mrg 	movl	%eax, %ecx	C low carry
    317      1.1  mrg 
    318      1.1  mrg 	jz	L(noswap)
    319      1.1  mrg 	movl	%esi, %ecx	C high,low carry other way around
    320      1.1  mrg 
    321      1.1  mrg 	movl	%eax, %esi
    322      1.1  mrg L(noswap):
    323      1.1  mrg 
    324      1.1  mrg 	jmp	*%edx
    325      1.1  mrg 
    326      1.1  mrg 
    327      1.1  mrg ifdef(`PIC',`
    328      1.1  mrg L(pic_calc):
    329      1.1  mrg 	C See mpn/x86/README about old gas bugs
    330      1.1  mrg 	leal	(%edx,%ecx,1), %edx
    331      1.1  mrg 	addl	$L(entry)-L(here), %edx
    332      1.1  mrg 	addl	(%esp), %edx
    333      1.1  mrg 	ret_internal
    334      1.1  mrg ')
    335      1.1  mrg 
    336      1.1  mrg 
    337      1.1  mrg C -----------------------------------------------------------
    338      1.1  mrg 	ALIGN(32)
    339      1.1  mrg L(top):
    340      1.1  mrg deflit(`FRAME',16)
    341      1.1  mrg 	C eax	scratch
    342      1.1  mrg 	C ebx	src
    343      1.1  mrg 	C ecx	carry lo
    344      1.1  mrg 	C edx	scratch
    345      1.1  mrg 	C esi	carry hi
    346      1.1  mrg 	C edi	dst
    347      1.1  mrg 	C ebp	multiplier
    348      1.1  mrg 	C
    349      1.1  mrg 	C 15 code bytes per limb
    350      1.1  mrg 
    351      1.1  mrg 	leal	UNROLL_BYTES(%edi), %edi
    352      1.1  mrg 
    353      1.1  mrg L(entry):
    354      1.1  mrg forloop(`i', 0, UNROLL_COUNT/2-1, `
    355      1.1  mrg 	deflit(`disp0', eval(2*i*4))
    356      1.1  mrg 	deflit(`disp1', eval(disp0 + 4))
    357      1.1  mrg 
    358      1.1  mrg Zdisp(	movl,	disp0,(%ebx), %eax)
    359      1.1  mrg 	mull	%ebp
    360      1.1  mrg Zdisp(	M4_inst,%ecx, disp0,(%edi))
    361      1.1  mrg 	adcl	%eax, %esi
    362      1.1  mrg 	movl	%edx, %ecx
    363      1.1  mrg 	jadcl0(	%ecx)
    364      1.1  mrg 
    365      1.1  mrg 	movl	disp1(%ebx), %eax
    366      1.1  mrg 	mull	%ebp
    367      1.1  mrg 	M4_inst	%esi, disp1(%edi)
    368      1.1  mrg 	adcl	%eax, %ecx
    369      1.1  mrg 	movl	%edx, %esi
    370      1.1  mrg 	jadcl0(	%esi)
    371      1.1  mrg ')
    372      1.1  mrg 
    373      1.1  mrg 	decl	VAR_COUNTER
    374      1.1  mrg 
    375      1.1  mrg 	leal	UNROLL_BYTES(%ebx), %ebx
    376      1.1  mrg 	jns	L(top)
    377      1.1  mrg 
    378      1.1  mrg 
    379      1.1  mrg 	popl	%ebp
    380      1.1  mrg 	M4_inst	%ecx, UNROLL_BYTES(%edi)
    381      1.1  mrg 
    382      1.1  mrg 	popl	%edi
    383      1.1  mrg 	movl	%esi, %eax
    384      1.1  mrg 
    385      1.1  mrg 	popl	%ebx
    386      1.1  mrg 	jadcl0(	%eax)
    387      1.1  mrg 
    388      1.1  mrg 	popl	%esi
    389      1.1  mrg 	ret
    390      1.1  mrg 
    391      1.1  mrg EPILOGUE()
    392