Home | History | Annotate | Line # | Download | only in k7
      1      1.1  mrg dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C          cycles/limb
     35      1.1  mrg C Athlon:     11.0
     36      1.1  mrg C Hammer:      9.0
     37      1.1  mrg 
     38      1.1  mrg 
     39      1.1  mrg C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     40      1.1  mrg C                      mp_limb_t divisor);
     41      1.1  mrg C
     42      1.1  mrg C The dependent chain is mul+imul+sub for 11 cycles and that speed is
     43      1.1  mrg C achieved with no special effort.  The load and shrld latencies are hidden
     44      1.1  mrg C by out of order execution.
     45      1.1  mrg C
     46      1.1  mrg C It's a touch faster on size==1 to use the mul-by-inverse than divl.
     47      1.1  mrg 
     48      1.1  mrg defframe(PARAM_DIVISOR,16)
     49      1.1  mrg defframe(PARAM_SIZE,   12)
     50      1.1  mrg defframe(PARAM_SRC,    8)
     51      1.1  mrg defframe(PARAM_DST,    4)
     52      1.1  mrg 
     53      1.1  mrg defframe(SAVE_EBX,     -4)
     54      1.1  mrg defframe(SAVE_ESI,     -8)
     55      1.1  mrg defframe(SAVE_EDI,    -12)
     56      1.1  mrg defframe(SAVE_EBP,    -16)
     57      1.1  mrg defframe(VAR_INVERSE, -20)
     58      1.1  mrg defframe(VAR_DST_END, -24)
     59      1.1  mrg 
     60      1.1  mrg deflit(STACK_SPACE, 24)
     61      1.1  mrg 
     62      1.1  mrg 	TEXT
     63      1.1  mrg 
     64      1.1  mrg 	ALIGN(16)
     65      1.1  mrg PROLOGUE(mpn_divexact_1)
     66      1.1  mrg deflit(`FRAME',0)
     67      1.1  mrg 
     68      1.1  mrg 	movl	PARAM_DIVISOR, %eax
     69      1.1  mrg 	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
     70      1.1  mrg 	movl	$-1, %ecx		C shift count
     71      1.1  mrg 
     72      1.1  mrg 	movl	%ebp, SAVE_EBP
     73      1.1  mrg 	movl	PARAM_SIZE, %ebp
     74      1.1  mrg 
     75      1.1  mrg 	movl	%esi, SAVE_ESI
     76      1.1  mrg 	movl	%edi, SAVE_EDI
     77      1.1  mrg 
     78      1.1  mrg 	C If there's usually only one or two trailing zero bits then this
     79      1.1  mrg 	C should be faster than bsfl.
     80      1.1  mrg L(strip_twos):
     81      1.1  mrg 	incl	%ecx
     82      1.1  mrg 	shrl	%eax
     83      1.1  mrg 	jnc	L(strip_twos)
     84      1.1  mrg 
     85      1.1  mrg 	movl	%ebx, SAVE_EBX
     86      1.1  mrg 	leal	1(%eax,%eax), %ebx	C d without twos
     87      1.1  mrg 	andl	$127, %eax		C d/2, 7 bits
     88      1.1  mrg 
     89      1.1  mrg ifdef(`PIC',`
     90      1.1  mrg 	LEA(	binvert_limb_table, %edx)
     91      1.1  mrg 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     92      1.1  mrg ',`
     93      1.1  mrg 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     94      1.1  mrg ')
     95      1.1  mrg 
     96      1.1  mrg 	leal	(%eax,%eax), %edx	C 2*inv
     97      1.1  mrg 	movl	%ebx, PARAM_DIVISOR	C d without twos
     98      1.1  mrg 
     99      1.1  mrg 	imull	%eax, %eax		C inv*inv
    100      1.1  mrg 
    101      1.1  mrg 	movl	PARAM_SRC, %esi
    102      1.1  mrg 	movl	PARAM_DST, %edi
    103      1.1  mrg 
    104      1.1  mrg 	imull	%ebx, %eax		C inv*inv*d
    105      1.1  mrg 
    106      1.1  mrg 	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
    107      1.1  mrg 	leal	(%edx,%edx), %eax	C 2*inv
    108      1.1  mrg 
    109      1.1  mrg 	imull	%edx, %edx		C inv*inv
    110      1.1  mrg 
    111      1.1  mrg 	leal	(%esi,%ebp,4), %esi	C src end
    112      1.1  mrg 	leal	(%edi,%ebp,4), %edi	C dst end
    113      1.1  mrg 	negl	%ebp			C -size
    114      1.1  mrg 
    115      1.1  mrg 	imull	%ebx, %edx		C inv*inv*d
    116      1.1  mrg 
    117      1.1  mrg 	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
    118      1.1  mrg 
    119      1.1  mrg 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    120      1.1  mrg 	pushl	%eax	FRAME_pushl()
    121      1.1  mrg 	imull	PARAM_DIVISOR, %eax
    122      1.1  mrg 	cmpl	$1, %eax
    123      1.1  mrg 	popl	%eax	FRAME_popl()')
    124      1.1  mrg 
    125      1.1  mrg 	movl	%eax, VAR_INVERSE
    126      1.1  mrg 	movl	(%esi,%ebp,4), %eax	C src[0]
    127      1.1  mrg 
    128      1.1  mrg 	incl	%ebp
    129      1.1  mrg 	jz	L(one)
    130      1.1  mrg 
    131      1.1  mrg 	movl	(%esi,%ebp,4), %edx	C src[1]
    132      1.1  mrg 
    133      1.1  mrg 	shrdl(	%cl, %edx, %eax)
    134      1.1  mrg 
    135      1.1  mrg 	movl	%edi, VAR_DST_END
    136      1.1  mrg 	xorl	%ebx, %ebx
    137      1.1  mrg 	jmp	L(entry)
    138      1.1  mrg 
    139      1.1  mrg 	ALIGN(8)
    140      1.1  mrg L(top):
    141      1.1  mrg 	C eax	q
    142      1.1  mrg 	C ebx	carry bit, 0 or 1
    143      1.1  mrg 	C ecx	shift
    144      1.1  mrg 	C edx
    145      1.1  mrg 	C esi	src end
    146      1.1  mrg 	C edi	dst end
    147      1.1  mrg 	C ebp	counter, limbs, negative
    148      1.1  mrg 
    149      1.1  mrg 	mull	PARAM_DIVISOR		C carry limb in edx
    150      1.1  mrg 
    151      1.1  mrg 	movl	-4(%esi,%ebp,4), %eax
    152      1.1  mrg 	movl	(%esi,%ebp,4), %edi
    153      1.1  mrg 
    154      1.1  mrg 	shrdl(	%cl, %edi, %eax)
    155      1.1  mrg 
    156      1.1  mrg 	subl	%ebx, %eax		C apply carry bit
    157      1.1  mrg 	setc	%bl
    158      1.1  mrg 	movl	VAR_DST_END, %edi
    159      1.1  mrg 
    160      1.1  mrg 	subl	%edx, %eax		C apply carry limb
    161      1.1  mrg 	adcl	$0, %ebx
    162      1.1  mrg 
    163      1.1  mrg L(entry):
    164      1.1  mrg 	imull	VAR_INVERSE, %eax
    165      1.1  mrg 
    166      1.1  mrg 	movl	%eax, -4(%edi,%ebp,4)
    167      1.1  mrg 	incl	%ebp
    168      1.1  mrg 	jnz	L(top)
    169      1.1  mrg 
    170      1.1  mrg 
    171      1.1  mrg 	mull	PARAM_DIVISOR		C carry limb in edx
    172      1.1  mrg 
    173      1.1  mrg 	movl	-4(%esi), %eax		C src high limb
    174      1.1  mrg 	shrl	%cl, %eax
    175      1.1  mrg 	movl	SAVE_ESI, %esi
    176      1.1  mrg 
    177      1.1  mrg 	subl	%ebx, %eax		C apply carry bit
    178      1.1  mrg 	movl	SAVE_EBX, %ebx
    179      1.1  mrg 	movl	SAVE_EBP, %ebp
    180      1.1  mrg 
    181      1.1  mrg 	subl	%edx, %eax		C apply carry limb
    182      1.1  mrg 
    183      1.1  mrg 	imull	VAR_INVERSE, %eax
    184      1.1  mrg 
    185      1.1  mrg 	movl	%eax, -4(%edi)
    186      1.1  mrg 	movl	SAVE_EDI, %edi
    187      1.1  mrg 	addl	$STACK_SPACE, %esp
    188      1.1  mrg 
    189      1.1  mrg 	ret
    190      1.1  mrg 
    191      1.1  mrg 
    192      1.1  mrg L(one):
    193      1.1  mrg 	shrl	%cl, %eax
    194      1.1  mrg 	movl	SAVE_ESI, %esi
    195      1.1  mrg 	movl	SAVE_EBX, %ebx
    196      1.1  mrg 
    197      1.1  mrg 	imull	VAR_INVERSE, %eax
    198      1.1  mrg 
    199      1.1  mrg 	movl	SAVE_EBP, %ebp
    200      1.1  mrg 	movl	%eax, -4(%edi)
    201      1.1  mrg 
    202      1.1  mrg 	movl	SAVE_EDI, %edi
    203      1.1  mrg 	addl	$STACK_SPACE, %esp
    204      1.1  mrg 
    205      1.1  mrg 	ret
    206      1.1  mrg 
    207      1.1  mrg EPILOGUE()
    208  1.1.1.2  mrg ASM_END()
    209