dnl  Home | History | Annotate | Line # | Download | only in mmx
      1 dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
      2 
      3 dnl  Copyright 1999-2002 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
     35 
     36 
     37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
     38 C                         mp_srcptr src, mp_size_t size,
     39 C                         mp_limb_t divisor);
     40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
     41 C                          mp_srcptr src, mp_size_t size,
     42 C                          mp_limb_t divisor, mp_limb_t carry);
     43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
     44 C                                mp_srcptr src, mp_size_t size,
     45 C                                mp_limb_t divisor, mp_limb_t inverse,
     46 C                                unsigned shift);
     47 C
     48 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
     49 C see that file for some comments.  It's possible what's here can be improved.
     50 
     51 
     52 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
     53 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
     54 dnl
     55 dnl  The different speeds of the integer and fraction parts means that using
     56 dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
     57 dnl  for the integer part and a bit lower for the fraction part.  (Or what's
     58 dnl  really wanted is to speed up the integer part!)
     59 dnl
     60 dnl  The threshold is set to make the integer part right.  At 4 limbs the
     61 dnl  div and mul are about the same there, but on the fractional part the
     62 dnl  mul is much faster.
     63 
     64 deflit(MUL_THRESHOLD, 4)
     65 
     66 
     67 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
     68 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
     69 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
     70 defframe(PARAM_DIVISOR,20)
     71 defframe(PARAM_SIZE,   16)
     72 defframe(PARAM_SRC,    12)
     73 defframe(PARAM_XSIZE,  8)
     74 defframe(PARAM_DST,    4)
     75 
     76 defframe(SAVE_EBX,    -4)
     77 defframe(SAVE_ESI,    -8)
     78 defframe(SAVE_EDI,    -12)
     79 defframe(SAVE_EBP,    -16)
     80 
     81 defframe(VAR_NORM,    -20)
     82 defframe(VAR_INVERSE, -24)
     83 defframe(VAR_SRC,     -28)
     84 defframe(VAR_DST,     -32)
     85 defframe(VAR_DST_STOP,-36)
     86 
     87 deflit(STACK_SPACE, 36)
     88 
     89 	TEXT
     90 	ALIGN(16)
     91 
     92 PROLOGUE(mpn_preinv_divrem_1)
     93 deflit(`FRAME',0)
     94 	movl	PARAM_XSIZE, %ecx
     95 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
     96 
     97 	movl	%esi, SAVE_ESI
     98 	movl	PARAM_SRC, %esi
     99 
    100 	movl	%ebx, SAVE_EBX
    101 	movl	PARAM_SIZE, %ebx
    102 
    103 	movl	%ebp, SAVE_EBP
    104 	movl	PARAM_DIVISOR, %ebp
    105 
    106 	movl	%edi, SAVE_EDI
    107 	movl	PARAM_DST, %edx
    108 
    109 	movl	-4(%esi,%ebx,4), %eax	C src high limb
    110 	xorl	%edi, %edi		C initial carry (if can't skip a div)
    111 
    112 	C
    113 
    114 	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
    115 	xor	%ecx, %ecx
    116 
    117 	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
    118 	cmpl	%ebp, %eax		C high cmp divisor
    119 
    120 	cmovc(	%eax, %edi)		C high is carry if high<divisor
    121 
    122 	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
    123 					C (the latter in case src==dst)
    124 
    125 	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
    126 
    127 	sbbl	$0, %ebx		C skip one division if high<divisor
    128 	movl	PARAM_PREINV_SHIFT, %ecx
    129 
    130 	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
    131 	movl	$32, %eax
    132 
    133 	movl	%edx, VAR_DST		C &dst[xsize+size]
    134 
    135 	shll	%cl, %ebp		C d normalized
    136 	subl	%ecx, %eax
    137 	movl	%ecx, VAR_NORM
    138 
    139 	movd	%eax, %mm7		C rshift
    140 	movl	PARAM_PREINV_INVERSE, %eax
    141 	jmp	L(start_preinv)
    142 
    143 EPILOGUE()
    144 
    145 
    146 
    147 	ALIGN(16)
    148 
    149 PROLOGUE(mpn_divrem_1c)
    150 deflit(`FRAME',0)
    151 	movl	PARAM_CARRY, %edx
    152 
    153 	movl	PARAM_SIZE, %ecx
    154 	subl	$STACK_SPACE, %esp
    155 deflit(`FRAME',STACK_SPACE)
    156 
    157 	movl	%ebx, SAVE_EBX
    158 	movl	PARAM_XSIZE, %ebx
    159 
    160 	movl	%edi, SAVE_EDI
    161 	movl	PARAM_DST, %edi
    162 
    163 	movl	%ebp, SAVE_EBP
    164 	movl	PARAM_DIVISOR, %ebp
    165 
    166 	movl	%esi, SAVE_ESI
    167 	movl	PARAM_SRC, %esi
    168 
    169 	leal	-4(%edi,%ebx,4), %edi
    170 	jmp	L(start_1c)
    171 
    172 EPILOGUE()
    173 
    174 
    175 	C offset 0x31, close enough to aligned
    176 PROLOGUE(mpn_divrem_1)
    177 deflit(`FRAME',0)
    178 
    179 	movl	PARAM_SIZE, %ecx
    180 	movl	$0, %edx		C initial carry (if can't skip a div)
    181 	subl	$STACK_SPACE, %esp
    182 deflit(`FRAME',STACK_SPACE)
    183 
    184 	movl	%ebp, SAVE_EBP
    185 	movl	PARAM_DIVISOR, %ebp
    186 
    187 	movl	%ebx, SAVE_EBX
    188 	movl	PARAM_XSIZE, %ebx
    189 
    190 	movl	%esi, SAVE_ESI
    191 	movl	PARAM_SRC, %esi
    192 	orl	%ecx, %ecx		C size
    193 
    194 	movl	%edi, SAVE_EDI
    195 	movl	PARAM_DST, %edi
    196 
    197 	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
    198 	jz	L(no_skip_div)		C if size==0
    199 
    200 	movl	-4(%esi,%ecx,4), %eax	C src high limb
    201 	xorl	%esi, %esi
    202 	cmpl	%ebp, %eax		C high cmp divisor
    203 
    204 	cmovc(	%eax, %edx)		C high is carry if high<divisor
    205 
    206 	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
    207 					C (the latter in case src==dst)
    208 
    209 	movl	%esi, (%edi,%ecx,4)	C dst high limb
    210 
    211 	sbbl	$0, %ecx		C size-1 if high<divisor
    212 	movl	PARAM_SRC, %esi		C reload
    213 L(no_skip_div):
    214 
    215 
    216 L(start_1c):
    217 	C eax
    218 	C ebx	xsize
    219 	C ecx	size
    220 	C edx	carry
    221 	C esi	src
    222 	C edi	&dst[xsize-1]
    223 	C ebp	divisor
    224 
    225 	leal	(%ebx,%ecx), %eax	C size+xsize
    226 	cmpl	$MUL_THRESHOLD, %eax
    227 	jae	L(mul_by_inverse)
    228 
    229 	orl	%ecx, %ecx
    230 	jz	L(divide_no_integer)
    231 
    232 L(divide_integer):
    233 	C eax	scratch (quotient)
    234 	C ebx	xsize
    235 	C ecx	counter
    236 	C edx	scratch (remainder)
    237 	C esi	src
    238 	C edi	&dst[xsize-1]
    239 	C ebp	divisor
    240 
    241 	movl	-4(%esi,%ecx,4), %eax
    242 
    243 	divl	%ebp
    244 
    245 	movl	%eax, (%edi,%ecx,4)
    246 	decl	%ecx
    247 	jnz	L(divide_integer)
    248 
    249 
    250 L(divide_no_integer):
    251 	movl	PARAM_DST, %edi
    252 	orl	%ebx, %ebx
    253 	jnz	L(divide_fraction)
    254 
    255 L(divide_done):
    256 	movl	SAVE_ESI, %esi
    257 
    258 	movl	SAVE_EDI, %edi
    259 
    260 	movl	SAVE_EBX, %ebx
    261 	movl	%edx, %eax
    262 
    263 	movl	SAVE_EBP, %ebp
    264 	addl	$STACK_SPACE, %esp
    265 
    266 	ret
    267 
    268 
    269 L(divide_fraction):
    270 	C eax	scratch (quotient)
    271 	C ebx	counter
    272 	C ecx
    273 	C edx	scratch (remainder)
    274 	C esi
    275 	C edi	dst
    276 	C ebp	divisor
    277 
    278 	movl	$0, %eax
    279 
    280 	divl	%ebp
    281 
    282 	movl	%eax, -4(%edi,%ebx,4)
    283 	decl	%ebx
    284 	jnz	L(divide_fraction)
    285 
    286 	jmp	L(divide_done)
    287 
    288 
    289 
    290 C -----------------------------------------------------------------------------
    291 
    292 L(mul_by_inverse):
    293 	C eax
    294 	C ebx	xsize
    295 	C ecx	size
    296 	C edx	carry
    297 	C esi	src
    298 	C edi	&dst[xsize-1]
    299 	C ebp	divisor
    300 
    301 	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
    302 
    303 	movl	%ebx, VAR_DST_STOP
    304 	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
    305 
    306 	movl	%edi, VAR_DST
    307 	movl	%ecx, %ebx		C size
    308 
    309 	bsrl	%ebp, %ecx		C 31-l
    310 	movl	%edx, %edi		C carry
    311 
    312 	leal	1(%ecx), %eax		C 32-l
    313 	xorl	$31, %ecx		C l
    314 
    315 	movl	%ecx, VAR_NORM
    316 	movl	$-1, %edx
    317 
    318 	shll	%cl, %ebp		C d normalized
    319 	movd	%eax, %mm7
    320 
    321 	movl	$-1, %eax
    322 	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
    323 
    324 	divl	%ebp			C floor (b*(b-d)-1) / d
    325 
    326 L(start_preinv):
    327 	C eax	inverse
    328 	C ebx	size
    329 	C ecx	shift
    330 	C edx
    331 	C esi	src
    332 	C edi	carry
    333 	C ebp	divisor
    334 	C
    335 	C mm7	rshift
    336 
    337 	movl	%eax, VAR_INVERSE
    338 	orl	%ebx, %ebx		C size
    339 	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
    340 
    341 	movl	%eax, VAR_SRC
    342 	jz	L(start_zero)
    343 
    344 	movl	8(%eax), %esi		C src high limb
    345 	cmpl	$1, %ebx
    346 	jz	L(start_one)
    347 
    348 L(start_two_or_more):
    349 	movl	4(%eax), %edx		C src second highest limb
    350 
    351 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
    352 
    353 	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
    354 
    355 	cmpl	$2, %ebx
    356 	je	L(integer_two_left)
    357 	jmp	L(integer_top)
    358 
    359 
    360 L(start_one):
    361 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
    362 
    363 	shll	%cl, %esi		C n10 = high << l
    364 	jmp	L(integer_one_left)
    365 
    366 
    367 L(start_zero):
    368 	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
    369 	C skipped a division.
    370 
    371 	shll	%cl, %edi		C n2 = carry << l
    372 	movl	%edi, %eax		C return value for zero_done
    373 	cmpl	$0, PARAM_XSIZE
    374 
    375 	je	L(zero_done)
    376 	jmp	L(fraction_some)
    377 
    378 
    379 
    380 C -----------------------------------------------------------------------------
    381 C
    382 C This loop runs at about 25 cycles, which is probably sub-optimal, and
    383 C certainly more than the dependent chain would suggest.  A better loop, or
    384 C a better rough analysis of what's possible, would be welcomed.
    385 C
    386 C In the current implementation, the following successively dependent
    387 C micro-ops seem to exist.
    388 C
    389 C		       uops
    390 C		n2+n1	1   (addl)
    391 C		mul	5
    392 C		q1+1	3   (addl/adcl)
    393 C		mul	5
    394 C		sub	3   (subl/sbbl)
    395 C		addback	2   (cmov)
    396 C		       ---
    397 C		       19
    398 C
    399 C Lack of registers hinders explicit scheduling and it might be that the
    400 C normal out of order execution isn't able to hide enough under the mul
    401 C latencies.
    402 C
    403 C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
    404 C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
    405 C combination was tried for the addback (despite the fact it would lengthen
    406 C the dependent chain) but found to be no faster.
    407 
    408 
    409 	ALIGN(16)
    410 L(integer_top):
    411 	C eax	scratch
    412 	C ebx	scratch (nadj, q1)
    413 	C ecx	scratch (src, dst)
    414 	C edx	scratch
    415 	C esi	n10
    416 	C edi	n2
    417 	C ebp	d
    418 	C
    419 	C mm0	scratch (src qword)
    420 	C mm7	rshift for normalization
    421 
    422 	movl	%esi, %eax
    423 	movl	%ebp, %ebx
    424 
    425 	sarl	$31, %eax          C -n1
    426 	movl	VAR_SRC, %ecx
    427 
    428 	andl	%eax, %ebx         C -n1 & d
    429 	negl	%eax               C n1
    430 
    431 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
    432 	addl	%edi, %eax         C n2+n1
    433 	movq	(%ecx), %mm0       C next src limb and the one below it
    434 
    435 	mull	VAR_INVERSE        C m*(n2+n1)
    436 
    437 	subl	$4, %ecx
    438 
    439 	movl	%ecx, VAR_SRC
    440 
    441 	C
    442 
    443 	C
    444 
    445 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
    446 	movl	%ebp, %eax	   C d
    447 	leal	1(%edi), %ebx      C n2+1
    448 
    449 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    450 	jz	L(q1_ff)
    451 
    452 	mull	%ebx		   C (q1+1)*d
    453 
    454 	movl	VAR_DST, %ecx
    455 	psrlq	%mm7, %mm0
    456 
    457 	C
    458 
    459 	C
    460 
    461 	C
    462 
    463 	subl	%eax, %esi
    464 	movl	VAR_DST_STOP, %eax
    465 
    466 	sbbl	%edx, %edi	   C n - (q1+1)*d
    467 	movl	%esi, %edi	   C remainder -> n2
    468 	leal	(%ebp,%esi), %edx
    469 
    470 	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
    471 	movd	%mm0, %esi
    472 
    473 	sbbl	$0, %ebx	   C q
    474 	subl	$4, %ecx
    475 
    476 	movl	%ebx, (%ecx)
    477 	cmpl	%eax, %ecx
    478 
    479 	movl	%ecx, VAR_DST
    480 	jne	L(integer_top)
    481 
    482 
    483 L(integer_loop_done):
    484 
    485 
    486 C -----------------------------------------------------------------------------
    487 C
    488 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
    489 C q1_ff special case.  This make the code a bit smaller and simpler, and
    490 C costs only 2 cycles (each).
    491 
    492 L(integer_two_left):
    493 	C eax	scratch
    494 	C ebx	scratch (nadj, q1)
    495 	C ecx	scratch (src, dst)
    496 	C edx	scratch
    497 	C esi	n10
    498 	C edi	n2
    499 	C ebp	divisor
    500 	C
    501 	C mm7	rshift
    502 
    503 
    504 	movl	%esi, %eax
    505 	movl	%ebp, %ebx
    506 
    507 	sarl	$31, %eax          C -n1
    508 	movl	PARAM_SRC, %ecx
    509 
    510 	andl	%eax, %ebx         C -n1 & d
    511 	negl	%eax               C n1
    512 
    513 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
    514 	addl	%edi, %eax         C n2+n1
    515 
    516 	mull	VAR_INVERSE        C m*(n2+n1)
    517 
    518 	movd	(%ecx), %mm0	   C src low limb
    519 
    520 	movl	VAR_DST_STOP, %ecx
    521 
    522 	C
    523 
    524 	C
    525 
    526 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
    527 	leal	1(%edi), %ebx      C n2+1
    528 	movl	%ebp, %eax	   C d
    529 
    530 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    531 
    532 	sbbl	$0, %ebx
    533 
    534 	mull	%ebx		   C (q1+1)*d
    535 
    536 	psllq	$32, %mm0
    537 
    538 	psrlq	%mm7, %mm0
    539 
    540 	C
    541 
    542 	C
    543 
    544 	subl	%eax, %esi
    545 
    546 	sbbl	%edx, %edi	   C n - (q1+1)*d
    547 	movl	%esi, %edi	   C remainder -> n2
    548 	leal	(%ebp,%esi), %edx
    549 
    550 	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
    551 	movd	%mm0, %esi
    552 
    553 	sbbl	$0, %ebx	   C q
    554 
    555 	movl	%ebx, -4(%ecx)
    556 
    557 
    558 C -----------------------------------------------------------------------------
    559 L(integer_one_left):
    560 	C eax	scratch
    561 	C ebx	scratch (nadj, q1)
    562 	C ecx	scratch (dst)
    563 	C edx	scratch
    564 	C esi	n10
    565 	C edi	n2
    566 	C ebp	divisor
    567 	C
    568 	C mm7	rshift
    569 
    570 
    571 	movl	%esi, %eax
    572 	movl	%ebp, %ebx
    573 
    574 	sarl	$31, %eax          C -n1
    575 	movl	VAR_DST_STOP, %ecx
    576 
    577 	andl	%eax, %ebx         C -n1 & d
    578 	negl	%eax               C n1
    579 
    580 	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
    581 	addl	%edi, %eax         C n2+n1
    582 
    583 	mull	VAR_INVERSE        C m*(n2+n1)
    584 
    585 	C
    586 
    587 	C
    588 
    589 	C
    590 
    591 	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
    592 	leal	1(%edi), %ebx      C n2+1
    593 	movl	%ebp, %eax	   C d
    594 
    595 	C
    596 
    597 	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
    598 
    599 	sbbl	$0, %ebx           C q1 if q1+1 overflowed
    600 
    601 	mull	%ebx
    602 
    603 	C
    604 
    605 	C
    606 
    607 	C
    608 
    609 	C
    610 
    611 	subl	%eax, %esi
    612 	movl	PARAM_XSIZE, %eax
    613 
    614 	sbbl	%edx, %edi	   C n - (q1+1)*d
    615 	movl	%esi, %edi	   C remainder -> n2
    616 	leal	(%ebp,%esi), %edx
    617 
    618 	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
    619 
    620 	sbbl	$0, %ebx	   C q
    621 
    622 	movl	%ebx, -8(%ecx)
    623 	subl	$8, %ecx
    624 
    625 
    626 
    627 	orl	%eax, %eax         C xsize
    628 	jnz	L(fraction_some)
    629 
    630 	movl	%edi, %eax
    631 L(fraction_done):
    632 	movl	VAR_NORM, %ecx
    633 L(zero_done):
    634 	movl	SAVE_EBP, %ebp
    635 
    636 	movl	SAVE_EDI, %edi
    637 
    638 	movl	SAVE_ESI, %esi
    639 
    640 	movl	SAVE_EBX, %ebx
    641 	addl	$STACK_SPACE, %esp
    642 
    643 	shrl	%cl, %eax
    644 	emms
    645 
    646 	ret
    647 
    648 
    649 C -----------------------------------------------------------------------------
    650 C
    651 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
    652 C of q*d is simply -d and the remainder n-q*d = n10+d
    653 
    654 L(q1_ff):
    655 	C eax	(divisor)
    656 	C ebx	(q1+1 == 0)
    657 	C ecx
    658 	C edx
    659 	C esi	n10
    660 	C edi	n2
    661 	C ebp	divisor
    662 
    663 	movl	VAR_DST, %ecx
    664 	movl	VAR_DST_STOP, %edx
    665 	subl	$4, %ecx
    666 
    667 	movl	%ecx, VAR_DST
    668 	psrlq	%mm7, %mm0
    669 	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
    670 
    671 	movl	$-1, (%ecx)
    672 	movd	%mm0, %esi		C next n10
    673 
    674 	cmpl	%ecx, %edx
    675 	jne	L(integer_top)
    676 
    677 	jmp	L(integer_loop_done)
    678 
    679 
    680 
    681 C -----------------------------------------------------------------------------
    682 C
    683 C In the current implementation, the following successively dependent
    684 C micro-ops seem to exist.
    685 C
    686 C		       uops
    687 C		mul	5
    688 C		q1+1	1   (addl)
    689 C		mul	5
    690 C		sub	3   (negl/sbbl)
    691 C		addback	2   (cmov)
    692 C		       ---
    693 C		       16
    694 C
    695 C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
    696 C the addback was found to be a touch slower.
    697 
    698 
    699 	ALIGN(16)
    700 L(fraction_some):
    701 	C eax
    702 	C ebx
    703 	C ecx
    704 	C edx
    705 	C esi
    706 	C edi	carry
    707 	C ebp	divisor
    708 
    709 	movl	PARAM_DST, %esi
    710 	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
    711 	movl	%edi, %eax
    712 
    713 	subl	$8, %ecx		C &dst[xsize]
    714 
    715 
    716 	ALIGN(16)
    717 L(fraction_top):
    718 	C eax	n2, then scratch
    719 	C ebx	scratch (nadj, q1)
    720 	C ecx	dst, decrementing
    721 	C edx	scratch
    722 	C esi	dst stop point
    723 	C edi	n2
    724 	C ebp	divisor
    725 
    726 	mull	VAR_INVERSE	C m*n2
    727 
    728 	movl	%ebp, %eax	C d
    729 	subl	$4, %ecx	C dst
    730 	leal	1(%edi), %ebx
    731 
    732 	C
    733 
    734 	C
    735 
    736 	C
    737 
    738 	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
    739 
    740 	mull	%ebx		C (q1+1)*d
    741 
    742 	C
    743 
    744 	C
    745 
    746 	C
    747 
    748 	C
    749 
    750 	negl	%eax		C low of n - (q1+1)*d
    751 
    752 	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
    753 	leal	(%ebp,%eax), %edx
    754 
    755 	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
    756 
    757 	sbbl	$0, %ebx	C q
    758 	movl	%eax, %edi	C remainder->n2
    759 	cmpl	%esi, %ecx
    760 
    761 	movl	%ebx, (%ecx)	C previous q
    762 	jne	L(fraction_top)
    763 
    764 
    765 	jmp	L(fraction_done)
    766 
    767 EPILOGUE()
    768