Home | History | Annotate | Line # | Download | only in k7
dive_1.asm revision 1.1.1.1
      1 dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
      2 
      3 dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or
      8 dnl  modify it under the terms of the GNU Lesser General Public License as
      9 dnl  published by the Free Software Foundation; either version 3 of the
     10 dnl  License, or (at your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 dnl  Lesser General Public License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C          cycles/limb
     24 C Athlon:     11.0
     25 C Hammer:      9.0
     26 
     27 
     28 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     29 C                      mp_limb_t divisor);
     30 C
     31 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
     32 C achieved with no special effort.  The load and shrld latencies are hidden
     33 C by out of order execution.
     34 C
     35 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
     36 
     37 defframe(PARAM_DIVISOR,16)	C divisor argument; the code later overwrites this slot with d without twos
     38 defframe(PARAM_SIZE,   12)	C number of limbs
     39 defframe(PARAM_SRC,    8)	C source limb pointer
     40 defframe(PARAM_DST,    4)	C destination limb pointer
     41 C Slots below hold saved registers and two locals, 6 slots of 4 bytes = STACK_SPACE (offset base presumably esp at entry, confirm against defframe)
     42 defframe(SAVE_EBX,     -4)
     43 defframe(SAVE_ESI,     -8)
     44 defframe(SAVE_EDI,    -12)
     45 defframe(SAVE_EBP,    -16)
     46 defframe(VAR_INVERSE, -20)	C inverse of d without twos, read by imull in the loop
     47 defframe(VAR_DST_END, -24)	C copy of dst end pointer, since edi is reused as a temporary in the loop
     48 
     49 deflit(STACK_SPACE, 24)	C bytes subtracted from esp on entry and added back before each ret
     50 
     51 	TEXT
     52 
     53 	ALIGN(16)
     54 PROLOGUE(mpn_divexact_1)
     55 deflit(`FRAME',0)
     56 C Writes size limbs at dst: src shifted right by the twos of d, then multiplied limb by limb by the inverse of odd d, with borrow propagation.
     57 	movl	PARAM_DIVISOR, %eax	C eax = d
     58 	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
     59 	movl	$-1, %ecx		C shift count, becomes 0 on the first incl below
     60 
     61 	movl	%ebp, SAVE_EBP
     62 	movl	PARAM_SIZE, %ebp	C ebp = size
     63 
     64 	movl	%esi, SAVE_ESI
     65 	movl	%edi, SAVE_EDI
     66 
     67 	C If there's usually only one or two trailing zero bits then this
     68 	C should be faster than bsfl.
     69 L(strip_twos):
     70 	incl	%ecx
     71 	shrl	%eax			C low bit of d goes to the carry flag
     72 	jnc	L(strip_twos)		C repeat until a 1 bit has been shifted out
     73 
     74 	movl	%ebx, SAVE_EBX
     75 	leal	1(%eax,%eax), %ebx	C d without twos
     76 	andl	$127, %eax		C d/2, 7 bits
     77 
     78 ifdef(`PIC',`
     79 	LEA(	binvert_limb_table, %edx)
     80 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     81 ',`
     82 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     83 ')
     84 
     85 	leal	(%eax,%eax), %edx	C 2*inv
     86 	movl	%ebx, PARAM_DIVISOR	C d without twos
     87 
     88 	imull	%eax, %eax		C inv*inv
     89 
     90 	movl	PARAM_SRC, %esi	C esi = src
     91 	movl	PARAM_DST, %edi	C edi = dst
     92 
     93 	imull	%ebx, %eax		C inv*inv*d
     94 
     95 	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
     96 	leal	(%edx,%edx), %eax	C 2*inv
     97 
     98 	imull	%edx, %edx		C inv*inv
     99 
    100 	leal	(%esi,%ebp,4), %esi	C src end
    101 	leal	(%edi,%ebp,4), %edi	C dst end
    102 	negl	%ebp			C -size
    103 
    104 	imull	%ebx, %edx		C inv*inv*d
    105 
    106 	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
    107 
    108 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    109 	pushl	%eax	FRAME_pushl()
    110 	imull	PARAM_DIVISOR, %eax
    111 	cmpl	$1, %eax
    112 	popl	%eax	FRAME_popl()')
    113 
    114 	movl	%eax, VAR_INVERSE	C keep the inverse in memory, eax is needed for limbs
    115 	movl	(%esi,%ebp,4), %eax	C src[0]
    116 
    117 	incl	%ebp			C ebp = -(size-1)
    118 	jz	L(one)			C size==1, there is no src[1] to load
    119 
    120 	movl	(%esi,%ebp,4), %edx	C src[1]
    121 
    122 	shrdl(	%cl, %edx, %eax)	C eax = low limb of src[1]:src[0] shifted right by ecx
    123 
    124 	movl	%edi, VAR_DST_END	C saved because the loop reuses edi as a temporary
    125 	xorl	%ebx, %ebx		C no carry bit yet
    126 	jmp	L(entry)		C first limb has no carry limb to subtract
    127 
    128 	ALIGN(8)
    129 L(top):
    130 	C eax	q
    131 	C ebx	carry bit, 0 or 1
    132 	C ecx	shift
    133 	C edx	scratch, carry limb after the mull
    134 	C esi	src end
    135 	C edi	dst end
    136 	C ebp	counter, limbs, negative
    137 
    138 	mull	PARAM_DIVISOR		C carry limb in edx
    139 
    140 	movl	-4(%esi,%ebp,4), %eax	C current src limb
    141 	movl	(%esi,%ebp,4), %edi	C next higher src limb, edi used as a temporary
    142 
    143 	shrdl(	%cl, %edi, %eax)	C eax = current limb with low bits of the next limb shifted in
    144 
    145 	subl	%ebx, %eax		C apply carry bit
    146 	setc	%bl			C ebx = borrow from that subtract (upper bits of ebx are already zero)
    147 	movl	VAR_DST_END, %edi	C restore dst end, movl leaves flags alone
    148 
    149 	subl	%edx, %eax		C apply carry limb
    150 	adcl	$0, %ebx		C add in the borrow from the carry limb subtract
    151 
    152 L(entry):
    153 	imull	VAR_INVERSE, %eax	C q = eax * inverse, low limb only
    154 
    155 	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
    156 	incl	%ebp
    157 	jnz	L(top)			C loop until only the high limb remains
    158 
    159 
    160 	mull	PARAM_DIVISOR		C carry limb in edx
    161 
    162 	movl	-4(%esi), %eax		C src high limb
    163 	shrl	%cl, %eax		C plain shift, nothing above the high limb to shift in
    164 	movl	SAVE_ESI, %esi
    165 
    166 	subl	%ebx, %eax		C apply carry bit
    167 	movl	SAVE_EBX, %ebx
    168 	movl	SAVE_EBP, %ebp
    169 
    170 	subl	%edx, %eax		C apply carry limb
    171 
    172 	imull	VAR_INVERSE, %eax	C high quotient limb
    173 
    174 	movl	%eax, -4(%edi)		C store at dst end - 4
    175 	movl	SAVE_EDI, %edi
    176 	addl	$STACK_SPACE, %esp	C release the frame
    177 
    178 	ret
    179 
    180 
    181 L(one):				C size==1 path, eax = src[0]
    182 	shrl	%cl, %eax		C remove the twos of d from the single limb
    183 	movl	SAVE_ESI, %esi
    184 	movl	SAVE_EBX, %ebx
    185 
    186 	imull	VAR_INVERSE, %eax	C q = shifted limb * inverse
    187 
    188 	movl	SAVE_EBP, %ebp
    189 	movl	%eax, -4(%edi)		C edi is dst end, so this is dst[0]
    190 
    191 	movl	SAVE_EDI, %edi
    192 	addl	$STACK_SPACE, %esp	C release the frame
    193 
    194 	ret
    195 
    196 EPILOGUE()
    197