Home | History | Annotate | Line # | Download | only in k7
dive_1.asm revision 1.1.1.2
      1 dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
      2 
      3 dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C          cycles/limb
     35 C Athlon:     11.0
     36 C Hammer:      9.0
     37 
     38 
     39 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     40 C                      mp_limb_t divisor);
     41 C
     42 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
     43 C achieved with no special effort.  The load and shrld latencies are hidden
     44 C by out of order execution.
     45 C
     46 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
     47 
     48 defframe(PARAM_DIVISOR,16)	C divisor limb; this slot is later rewritten with d without twos
     49 defframe(PARAM_SIZE,   12)	C number of limbs at src and dst
     50 defframe(PARAM_SRC,    8)	C source limb pointer
     51 defframe(PARAM_DST,    4)	C destination limb pointer
     52 
        C Local slots at negative offsets.  Six slots of 4 bytes each reach down
        C to -24, which matches the STACK_SPACE value reserved in the prologue.
        C NOTE(review): defframe is defined outside this file; presumably the
        C offsets are relative to esp at function entry and are adjusted by the
        C current FRAME value -- confirm in the m4 support macros.
     53 defframe(SAVE_EBX,     -4)	C saved ebx, restored before each ret
     54 defframe(SAVE_ESI,     -8)	C saved esi, restored before each ret
     55 defframe(SAVE_EDI,    -12)	C saved edi, restored before each ret
     56 defframe(SAVE_EBP,    -16)	C saved ebp, restored before each ret
     57 defframe(VAR_INVERSE, -20)	C inverse of the odd divisor, used by the imull per limb
     58 defframe(VAR_DST_END, -24)	C copy of the dst end pointer, reloaded inside the loop
     59 
     60 deflit(STACK_SPACE, 24)	C bytes subtracted from esp on entry and added back before ret
     61 
     62 	TEXT
     63 
        C mpn_divexact_1 -- divide the size-limb number at src by the single limb
        C divisor and store the size-limb quotient at dst.
        C
        C Arguments are read from the stack through the PARAM_ frame macros:
        C PARAM_DST, PARAM_SRC, PARAM_SIZE and PARAM_DIVISOR.  The C prototype is
        C void, and no result register is set up before ret.
        C
        C Method, as implemented below:
        C   1. Shift the trailing zero bits out of the divisor, counting them in
        C      ecx, which leaves the odd value d.
        C   2. Build inv with d*inv == 1 mod 2^32: an 8-bit table lookup followed
        C      by two steps of inv = 2*inv - inv*inv*d, each step doubling the
        C      number of valid low bits.
        C   3. For each limb from low to high: take the source shifted right by
        C      ecx bits, subtract the running carry bit and the high half of the
        C      previous q*d, then multiply by inv to get the quotient limb q.
        C
        C Assumptions that are not checked here (NOTE(review): confirm against
        C the callers):
        C   - divisor is non-zero; with zero the strip_twos loop never sees a
        C     carry and does not terminate.
        C   - size is at least 1; src[0] is loaded unconditionally.
        C   - presumably the division leaves no remainder, as the name says.
        C
        C ebx, esi, edi and ebp are saved in the SAVE_ slots and restored before
        C each ret.  eax, ecx and edx are overwritten.  The PARAM_DIVISOR stack
        C slot is overwritten with d.
     64 	ALIGN(16)
     65 PROLOGUE(mpn_divexact_1)
     66 deflit(`FRAME',0)
     67 
     68 	movl	PARAM_DIVISOR, %eax
     69 	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
        	C FRAME is set to STACK_SPACE on the line above; presumably this keeps
        	C the PARAM_ and SAVE_ macros addressing the same slots now that esp
        	C has moved -- confirm in the m4 support macros.
     70 	movl	$-1, %ecx		C shift count, becomes 0 on the first incl below
     71 
     72 	movl	%ebp, SAVE_EBP
     73 	movl	PARAM_SIZE, %ebp		C ebp = size in limbs
     74 
     75 	movl	%esi, SAVE_ESI
     76 	movl	%edi, SAVE_EDI
     77 
     78 	C If there's usually only one or two trailing zero bits then this
     79 	C should be faster than bsfl.
        	C
        	C Each pass shifts the low bit of eax into the carry flag, and the
        	C loop ends when that bit was 1.  On exit ecx is the number of zero
        	C bits removed and eax holds the odd part of the divisor shifted
        	C right once more, that is (d-1)/2.
     80 L(strip_twos):
     81 	incl	%ecx
     82 	shrl	%eax
     83 	jnc	L(strip_twos)
     84 
     85 	movl	%ebx, SAVE_EBX
     86 	leal	1(%eax,%eax), %ebx	C d without twos
     87 	andl	$127, %eax		C d/2, 7 bits
        	C The 7 bits kept are bits 1 to 7 of d; bit 0 of d is 1 at this point.
     88 
        	C Fetch the starting 8-bit inverse from binvert_limb_table, which is
        	C defined outside this file.  The PIC variant first gets the table
        	C address into edx through the LEA macro.
     89 ifdef(`PIC',`
     90 	LEA(	binvert_limb_table, %edx)
     91 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     92 ',`
     93 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     94 ')
     95 
        	C First refinement step.  The pointer loads are placed between the
        	C multiplies, presumably to overlap with the imull latency.
     96 	leal	(%eax,%eax), %edx	C 2*inv
     97 	movl	%ebx, PARAM_DIVISOR	C d without twos
        	C The argument slot now holds d and is the memory operand of the
        	C mull instructions further down.
     98 
     99 	imull	%eax, %eax		C inv*inv
    100 
    101 	movl	PARAM_SRC, %esi
    102 	movl	PARAM_DST, %edi
    103 
    104 	imull	%ebx, %eax		C inv*inv*d
    105 
    106 	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
        	C Second refinement step, same formula applied to the new inv.
    107 	leal	(%edx,%edx), %eax	C 2*inv
    108 
    109 	imull	%edx, %edx		C inv*inv
    110 
        	C esi and edi are moved one past the last limb, and ebp is negated,
        	C so that (%esi,%ebp,4) starts at the first limb and ebp counts up
        	C towards zero.
    111 	leal	(%esi,%ebp,4), %esi	C src end
    112 	leal	(%edi,%ebp,4), %edi	C dst end
    113 	negl	%ebp			C -size
    114 
    115 	imull	%ebx, %edx		C inv*inv*d
    116 
    117 	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
    118 
        	C Self check of the inverse.  ASSERT is a macro defined outside this
        	C file; presumably it emits code only in assertion-enabled builds.
        	C eax is preserved around the test by the push and pop.
    119 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    120 	pushl	%eax	FRAME_pushl()
    121 	imull	PARAM_DIVISOR, %eax
    122 	cmpl	$1, %eax
    123 	popl	%eax	FRAME_popl()')
    124 
    125 	movl	%eax, VAR_INVERSE
    126 	movl	(%esi,%ebp,4), %eax	C src[0]
    127 
    128 	incl	%ebp			C ebp = -(size-1)
    129 	jz	L(one)			C size == 1, there is no second limb to read
    130 
    131 	movl	(%esi,%ebp,4), %edx	C src[1]
    132 
    133 	shrdl(	%cl, %edx, %eax)	C eax = low limb of src shifted right by ecx bits
    134 
    135 	movl	%edi, VAR_DST_END	C edi is reused as a temporary inside the loop
    136 	xorl	%ebx, %ebx		C no carry bit yet
    137 	jmp	L(entry)		C the first limb has no carry to apply
    138 
    139 	ALIGN(8)
    140 L(top):
    141 	C eax	q
    142 	C ebx	carry bit, 0 or 1
    143 	C ecx	shift
    144 	C edx	scratch
    145 	C esi	src end
    146 	C edi	dst end (also a temporary, reloaded from VAR_DST_END below)
    147 	C ebp	counter, limbs, negative
    148 
        	C edx:eax = q*d for the q just stored; only the high half in edx is
        	C used, eax is overwritten by the next load.
    149 	mull	PARAM_DIVISOR		C carry limb in edx
    150 
    151 	movl	-4(%esi,%ebp,4), %eax	C current source limb
    152 	movl	(%esi,%ebp,4), %edi	C next higher source limb, supplies the bits shifted in
    153 
    154 	shrdl(	%cl, %edi, %eax)
    155 
    156 	subl	%ebx, %eax		C apply carry bit
    157 	setc	%bl			C borrow from that subtract; upper bytes of ebx are already zero
    158 	movl	VAR_DST_END, %edi	C restore dst end after using edi as a temporary
    159 
    160 	subl	%edx, %eax		C apply carry limb
    161 	adcl	$0, %ebx		C fold the borrow from the carry limb subtract into the carry bit
    162 
    163 L(entry):
    164 	imull	VAR_INVERSE, %eax	C q = limb * inv mod 2^32
    165 
    166 	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
    167 	incl	%ebp
    168 	jnz	L(top)			C loop until only the high limb remains
    169 
    170 
        	C Final limb.  There is no higher source limb to shift in, so a plain
        	C shrl is used, and the borrow out of the two subtracts is not
        	C recorded.  Restores of the saved registers are interleaved with
        	C the arithmetic.
    171 	mull	PARAM_DIVISOR		C carry limb in edx
    172 
    173 	movl	-4(%esi), %eax		C src high limb
    174 	shrl	%cl, %eax
    175 	movl	SAVE_ESI, %esi
    176 
    177 	subl	%ebx, %eax		C apply carry bit
    178 	movl	SAVE_EBX, %ebx
    179 	movl	SAVE_EBP, %ebp
    180 
    181 	subl	%edx, %eax		C apply carry limb
    182 
    183 	imull	VAR_INVERSE, %eax
    184 
    185 	movl	%eax, -4(%edi)		C dst high limb
    186 	movl	SAVE_EDI, %edi
    187 	addl	$STACK_SPACE, %esp
    188 
    189 	ret
    190 
    191 
        	C size == 1: the quotient is (src[0] >> shift) * inverse.  edi is the
        	C dst end pointer, so -4(%edi) is dst[0].
    192 L(one):
    193 	shrl	%cl, %eax
    194 	movl	SAVE_ESI, %esi
    195 	movl	SAVE_EBX, %ebx
    196 
    197 	imull	VAR_INVERSE, %eax
    198 
    199 	movl	SAVE_EBP, %ebp
    200 	movl	%eax, -4(%edi)
    201 
    202 	movl	SAVE_EDI, %edi
    203 	addl	$STACK_SPACE, %esp
    204 
    205 	ret
    206 
    207 EPILOGUE()
    208 ASM_END()
    209