Home | History | Annotate | Line # | Download | only in k6
      1 dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
      2 
      3 dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C			    cycles/limb
     35 C P5
     36 C P6 model 0-8,10-12		 5.94
     37 C P6 model 9  (Banias)		 5.51
     38 C P6 model 13 (Dothan)		 5.57
     39 C P4 model 0  (Willamette)
     40 C P4 model 1  (?)
     41 C P4 model 2  (Northwood)
     42 C P4 model 3  (Prescott)
     43 C P4 model 4  (Nocona)
     44 C AMD K6			7.65-8.5 (data dependent)
     45 C AMD K7
     46 C AMD K8
     47 
     48 
     49 dnl  K6:           large multipliers  small multipliers
     50 dnl  UNROLL_COUNT    cycles/limb       cycles/limb
     51 dnl        4             9.5              7.78
     52 dnl        8             9.0              7.78
     53 dnl       16             8.4              7.65
     54 dnl       32             8.4              8.2
     55 dnl
     56 dnl  Maximum possible unrolling with the current code is 32.
     57 dnl
     58 dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
     59 dnl  byte block, which might explain the good speed at that unrolling.
     60 
     61 deflit(UNROLL_COUNT, 16)
     62 
     63 
     64 ifdef(`OPERATION_addmul_1', `
     65 	define(M4_inst,        addl)
     66 	define(M4_function_1,  mpn_addmul_1)
     67 	define(M4_function_1c, mpn_addmul_1c)
     68 ',`ifdef(`OPERATION_submul_1', `
     69 	define(M4_inst,        subl)
     70 	define(M4_function_1,  mpn_submul_1)
     71 	define(M4_function_1c, mpn_submul_1c)
     72 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
     73 ')')')
     74 
     75 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
     76 
     77 
     78 C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     79 C                         mp_limb_t mult);
     80 C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     81 C                          mp_limb_t mult, mp_limb_t carry);
     82 C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     83 C                         mp_limb_t mult);
     84 C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
     85 C                          mp_limb_t mult, mp_limb_t carry);
     86 C
     87 C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
     88 C multipliers (most significant few bits clear) result in few carry bits and
     89 C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
     90 C significant few bits set) make the carry bits 50/50 and lead to something
     91 C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
     92 C
     93 C It's important that the gains for jadcl0 on small multipliers don't come
     94 C at the cost of slowing down other data.  Tests on uniformly distributed
     95 C random data, designed to confound branch prediction, show about a 7%
     96 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
     97 C overheads included).
     98 C
     99 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
    100 C 11.0 cycles/limb), and hence isn't used.
    101 C
    102 C In the simple loop, note that running ecx from negative to zero and using
    103 C it as an index in the two movs wouldn't help.  It would save one
    104 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
    105 C that would be collapsed by this.
    106 C
    107 C Attempts at a simpler main loop, with less unrolling, haven't yielded much
    108 C success, generally running over 9 c/l.
    109 C
    110 C
    111 C jadcl0
    112 C ------
    113 C
    114 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
    115 C firstly the instruction decoding and secondly the fact that there's a
    116 C carry bit for the jadcl0 only on average about 1/4 of the time.
    117 C
    118 C The code in the unrolled loop decodes something like the following.
    119 C
    120 C                                         decode cycles
    121 C		mull	%ebp                    2
    122 C		M4_inst	%esi, disp(%edi)        1
    123 C		adcl	%eax, %ecx              2
    124 C		movl	%edx, %esi            \ 1
    125 C		jnc	1f                    /
    126 C		incl	%esi                  \ 1
    127 C	1:	movl	disp(%ebx), %eax      /
    128 C                                              ---
    129 C                                               7
    130 C
    131 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
    132 C with it taken (both when correctly predicted).  This is opposite to the
    133 C measurements showing small multipliers running faster than large ones.
    134 C Don't really know why.
    135 C
    136 C It's not clear how much branch misprediction might be costing.  The K6
    137 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
    138 C of that range to get the measured results.
    139 C
    140 C
    141 C In the code the two carries are more or less the preceding mul product and
    142 C the calculation is roughly
    143 C
    144 C	x*y + u*b+v
    145 C
    146 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
    147 C v are the two limbs it's added to (being the low of the next mul, and a
    148 C limb from the destination).
    149 C
    150 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
    151 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
    152 C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
    153 C and b-1, then the total probability can be summed over x and y,
    154 C
    155 C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
    156 C	--- * sum sum --- = --- * ------- * ------- = 1/4
    157 C       b^2   x=0 y=1 b^2   b^4      2         2
    158 C
    159 C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
    160 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
    161 
    162 
    163 ifdef(`PIC',`
    164 deflit(UNROLL_THRESHOLD, 9)
    165 ',`
    166 deflit(UNROLL_THRESHOLD, 6)
    167 ')
    168 
    169 defframe(PARAM_CARRY,     20)
    170 defframe(PARAM_MULTIPLIER,16)
    171 defframe(PARAM_SIZE,      12)
    172 defframe(PARAM_SRC,       8)
    173 defframe(PARAM_DST,       4)
    174 
    175 	TEXT
    176 	ALIGN(32)
    177 
    178 PROLOGUE(M4_function_1c)
    179 	pushl	%esi
    180 deflit(`FRAME',4)
    181 	movl	PARAM_CARRY, %esi
    182 	jmp	L(start_nc)
    183 EPILOGUE()
    184 
    185 PROLOGUE(M4_function_1)
    186 	push	%esi
    187 deflit(`FRAME',4)
    188 	xorl	%esi, %esi	C initial carry
    189 
    190 L(start_nc):
    191 	movl	PARAM_SIZE, %ecx
    192 	pushl	%ebx
    193 deflit(`FRAME',8)
    194 
    195 	movl	PARAM_SRC, %ebx
    196 	pushl	%edi
    197 deflit(`FRAME',12)
    198 
    199 	cmpl	$UNROLL_THRESHOLD, %ecx
    200 	movl	PARAM_DST, %edi
    201 
    202 	pushl	%ebp
    203 deflit(`FRAME',16)
    204 	jae	L(unroll)
    205 
    206 
    207 	C simple loop
    208 
    209 	movl	PARAM_MULTIPLIER, %ebp
    210 
    211 L(simple):
    212 	C eax	scratch
    213 	C ebx	src
    214 	C ecx	counter
    215 	C edx	scratch
    216 	C esi	carry
    217 	C edi	dst
    218 	C ebp	multiplier
    219 
    220 	movl	(%ebx), %eax
    221 	addl	$4, %ebx
    222 
    223 	mull	%ebp
    224 
    225 	addl	$4, %edi
    226 	addl	%esi, %eax
    227 
    228 	adcl	$0, %edx
    229 
    230 	M4_inst	%eax, -4(%edi)
    231 
    232 	adcl	$0, %edx
    233 
    234 	movl	%edx, %esi
    235 	loop	L(simple)
    236 
    237 
    238 	popl	%ebp
    239 	popl	%edi
    240 
    241 	popl	%ebx
    242 	movl	%esi, %eax
    243 
    244 	popl	%esi
    245 	ret
    246 
    247 
    248 
    249 C -----------------------------------------------------------------------------
    250 C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
    251 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
    252 C For the computed jump an odd size means they start one way around, an even
    253 C size the other.
    254 C
    255 C VAR_JUMP holds the computed jump temporarily because there's not enough
    256 C registers at the point of doing the mul for the initial two carry limbs.
    257 C
    258 C The add/adc for the initial carry in %esi is necessary only for the
    259 C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
    260 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
    261 C idea.
    262 
    263 dnl  overlapping with parameters already fetched
    264 define(VAR_COUNTER, `PARAM_SIZE')
    265 define(VAR_JUMP,    `PARAM_DST')
    266 
    267 L(unroll):
    268 	C eax
    269 	C ebx	src
    270 	C ecx	size
    271 	C edx
    272 	C esi	initial carry
    273 	C edi	dst
    274 	C ebp
    275 
    276 	movl	%ecx, %edx
    277 	decl	%ecx
    278 
    279 	subl	$2, %edx
    280 	negl	%ecx
    281 
    282 	shrl	$UNROLL_LOG2, %edx
    283 	andl	$UNROLL_MASK, %ecx
    284 
    285 	movl	%edx, VAR_COUNTER
    286 	movl	%ecx, %edx
    287 
    288 	shll	$4, %edx
    289 	negl	%ecx
    290 
    291 	C 15 code bytes per limb
    292 ifdef(`PIC',`
    293 	call	L(pic_calc)
    294 L(here):
    295 ',`
    296 	leal	L(entry) (%edx,%ecx,1), %edx
    297 ')
    298 	movl	(%ebx), %eax		C src low limb
    299 
    300 	movl	PARAM_MULTIPLIER, %ebp
    301 	movl	%edx, VAR_JUMP
    302 
    303 	mull	%ebp
    304 
    305 	addl	%esi, %eax	C initial carry (from _1c)
    306 	jadcl0(	%edx)
    307 
    308 
    309 	leal	4(%ebx,%ecx,4), %ebx
    310 	movl	%edx, %esi	C high carry
    311 
    312 	movl	VAR_JUMP, %edx
    313 	leal	(%edi,%ecx,4), %edi
    314 
    315 	testl	$1, %ecx
    316 	movl	%eax, %ecx	C low carry
    317 
    318 	jz	L(noswap)
    319 	movl	%esi, %ecx	C high,low carry other way around
    320 
    321 	movl	%eax, %esi
    322 L(noswap):
    323 
    324 	jmp	*%edx
    325 
    326 
    327 ifdef(`PIC',`
    328 L(pic_calc):
    329 	C See mpn/x86/README about old gas bugs
    330 	leal	(%edx,%ecx,1), %edx
    331 	addl	$L(entry)-L(here), %edx
    332 	addl	(%esp), %edx
    333 	ret_internal
    334 ')
    335 
    336 
    337 C -----------------------------------------------------------
    338 	ALIGN(32)
    339 L(top):
    340 deflit(`FRAME',16)
    341 	C eax	scratch
    342 	C ebx	src
    343 	C ecx	carry lo
    344 	C edx	scratch
    345 	C esi	carry hi
    346 	C edi	dst
    347 	C ebp	multiplier
    348 	C
    349 	C 15 code bytes per limb
    350 
    351 	leal	UNROLL_BYTES(%edi), %edi
    352 
    353 L(entry):
    354 forloop(`i', 0, UNROLL_COUNT/2-1, `
    355 	deflit(`disp0', eval(2*i*4))
    356 	deflit(`disp1', eval(disp0 + 4))
    357 
    358 Zdisp(	movl,	disp0,(%ebx), %eax)
    359 	mull	%ebp
    360 Zdisp(	M4_inst,%ecx, disp0,(%edi))
    361 	adcl	%eax, %esi
    362 	movl	%edx, %ecx
    363 	jadcl0(	%ecx)
    364 
    365 	movl	disp1(%ebx), %eax
    366 	mull	%ebp
    367 	M4_inst	%esi, disp1(%edi)
    368 	adcl	%eax, %ecx
    369 	movl	%edx, %esi
    370 	jadcl0(	%esi)
    371 ')
    372 
    373 	decl	VAR_COUNTER
    374 
    375 	leal	UNROLL_BYTES(%ebx), %ebx
    376 	jns	L(top)
    377 
    378 
    379 	popl	%ebp
    380 	M4_inst	%ecx, UNROLL_BYTES(%edi)
    381 
    382 	popl	%edi
    383 	movl	%esi, %eax
    384 
    385 	popl	%ebx
    386 	jadcl0(	%eax)
    387 
    388 	popl	%esi
    389 	ret
    390 
    391 EPILOGUE()
    392