Home | History | Annotate | Line # | Download | only in pentium
      1 dnl  Intel Pentium mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- mpn by limb exact division.
      2 
      3 dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
      4 
      5 dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 
     36 C         divisor
     37 C       odd   even
     38 C P54:  24.5  30.5   cycles/limb
     39 C P55:  23.0  28.0
     40 
     41 MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
        C The line above lists the two entry points defined in this file, one per
        C function body below.
        C NOTE(review): presumably read by the build machinery -- confirm.
     42 
     43 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
     44 C expected.  On P54 in the even case the shrdl pairing nonsense (see
     45 C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
     46 C further 1.5 slowdown for both odd and even.
     47 
     48 defframe(PARAM_SHIFT,  24)	C 6th argument, shift (mpn_pi1_bdiv_q_1 only)
     49 defframe(PARAM_INVERSE,20)	C 5th argument, inverse (mpn_pi1_bdiv_q_1 only)
     50 defframe(PARAM_DIVISOR,16)	C 4th argument, divisor
     51 defframe(PARAM_SIZE,   12)	C 3rd argument, size in limbs
     52 defframe(PARAM_SRC,    8)	C 2nd argument, src
     53 defframe(PARAM_DST,    4)	C 1st argument, dst
        C NOTE(review): the numbers look like byte offsets from the stack pointer at
        C function entry, with FRAME tracking later pushes -- confirm against the
        C defframe definition.
     54 
     55 dnl  re-use parameter space
        dnl  The dst slot is overwritten with the inverse at L(common), right after
        dnl  dst itself has been loaded into edi.
     56 define(VAR_INVERSE,`PARAM_DST')
     57 
     58 	TEXT
     59 
     60 	ALIGN(32)
     61 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
     62 C                           mp_limb_t divisor);
     63 C
        C Front end which derives the shift count and the limb inverse from divisor
        C and then joins mpn_pi1_bdiv_q_1 at L(common).
        C
        C Steps:
        C   1. Strip the low zero bits of divisor, counting them in ecx.  divisor
        C      must be nonzero, otherwise the strip loop never gets a carry out.
        C   2. Look up an 8-bit inverse of the odd part d in binvert_limb_table,
        C      indexed by the 7 bits of d/2.
        C   3. Apply inv = 2*inv - inv*inv*d twice, after which the ASSERT below
        C      expects d*inv == 1 mod 2^GMP_LIMB_BITS.
        C   4. Store d over PARAM_DIVISOR and jump to L(common).
        C
        C State handed to L(common):
        C   eax            inverse of d
        C   ebx            size
        C   ecx            shift count, the number of low zero bits stripped
        C   PARAM_DIVISOR  d, the divisor without its twos
        C   stack          ebx then ebp pushed, the same two pushes that
        C                  mpn_pi1_bdiv_q_1 has made when it reaches L(common)
        C
     64 PROLOGUE(mpn_bdiv_q_1)
     65 deflit(`FRAME',0)
     66 
     67 	movl	$-1, %ecx		C -1 so the shift that finds the 1 bit is not counted
     68 	movl	PARAM_DIVISOR, %eax
     69 
     70 L(strip_twos):
     71 	ASSERT(nz, `orl %eax, %eax')
     72 	shrl	%eax			C low bit goes to the carry flag
     73 	incl	%ecx			C shift count
     74 
     75 	jnc	L(strip_twos)		C incl leaves the carry from shrl untouched
     76 
     77 	leal	1(%eax,%eax), %edx	C d, the odd part, with its low 1 bit put back
     78 	andl	$127, %eax		C d/2, 7 bits
     79 
     80 	pushl	%ebx		FRAME_pushl()
     81 	pushl	%ebp		FRAME_pushl()
     82 
     83 ifdef(`PIC',`
     84 ifdef(`DARWIN',`
     85 	LEA(	binvert_limb_table, %ebp)
     86 	movzbl	(%eax,%ebp), %eax
     87 ',`
     88 	call	L(here)
     89 L(here):
     90 	popl	%ebp			C eip
     91 
     92 	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
     93 	C AGI
     94 	movl	binvert_limb_table@GOT(%ebp), %ebp
     95 	C AGI
     96 	movzbl	(%eax,%ebp), %eax
     97 ')
     98 ',`
     99 
    100 dnl non-PIC
    101 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
    102 ')
    103 
        	C first refinement step
    104 	movl	%eax, %ebp		C inv
    105 	addl	%eax, %eax		C 2*inv
    106 
    107 	imull	%ebp, %ebp		C inv*inv
    108 
    109 	imull	%edx, %ebp		C inv*inv*d
    110 
    111 	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
    112 	movl	PARAM_SIZE, %ebx	C size, wanted in ebx at L(common)
    113 
        	C second refinement step
    114 	movl	%eax, %ebp
    115 	addl	%eax, %eax		C 2*inv
    116 
    117 	imull	%ebp, %ebp		C inv*inv
    118 
    119 	imull	%edx, %ebp		C inv*inv*d
    120 
    121 	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
    122 	movl	%edx, PARAM_DIVISOR	C d without twos
    123 
    124 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    125 	pushl	%eax	FRAME_pushl()
    126 	imull	PARAM_DIVISOR, %eax
    127 	cmpl	$1, %eax
    128 	popl	%eax	FRAME_popl()')
    129 
    130 	jmp	L(common)		C shared loops live in mpn_pi1_bdiv_q_1
    131 EPILOGUE()
    132 
    133 C mp_limb_t
    134 C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    135 C		    mp_limb_t inverse, int shift)
        C
        C Writes size quotient limbs to dst, least significant first.  The source
        C is taken shifted right by shift bits, and each limb is produced as
        C   q = (shifted src limb - carry) * inverse,  keeping the low 32 bits
        C where carry is the high limb of the previous q*divisor plus the borrow
        C out of the previous subtraction.
        C
        C The multiplies use divisor exactly as found in PARAM_DIVISOR.  When
        C entered from mpn_bdiv_q_1 that slot holds the odd part of the divisor.
        C
        C Two loops are used: L(odd_top) when shift is zero, and L(even_top) which
        C builds each limb with shrdl from two adjacent source limbs.  The top limb
        C in the even case has no limb above it, so it is finished separately at
        C L(one) with a plain shrl.
        C
        C size is expected to be at least 1: the low source limb is loaded before
        C any test of the count.
        C
        C VAR_INVERSE is kept in the PARAM_DST stack slot, which is free once dst
        C has been loaded into edi.
        C
        C NOTE(review): eax is not set up specially before ret, on both exits it
        C still holds the last quotient limb stored.
    136 	ALIGN(32)
    137 PROLOGUE(mpn_pi1_bdiv_q_1)
    138 deflit(`FRAME',0)
    139 
    140 	movl	PARAM_SHIFT, %ecx
    141 
    142 	pushl	%ebx		FRAME_pushl()
    143 	pushl	%ebp		FRAME_pushl()
    144 
    145 	movl	PARAM_SIZE, %ebx
    146 	movl	PARAM_INVERSE, %eax
    147 
    148 L(common):
        	C Entry state, also the target of the jmp in mpn_bdiv_q_1:
        	C eax	inverse
        	C ebx	size
        	C ecx	shift
        	C ebx and ebp already pushed
    149 	pushl	%esi		FRAME_pushl()
    150 	push	%edi		FRAME_pushl()
    151 
    152 	movl	PARAM_SRC, %esi
    153 	movl	PARAM_DST, %edi
    154 	movl	%eax, VAR_INVERSE	C overwrites the dst slot, edi already has dst
    155 
    156 	leal	(%esi,%ebx,4), %esi	C src end
    157 	leal	(%edi,%ebx,4), %edi	C dst end
    158 
    159 	negl	%ebx			C -size
    160 
    161 	xorl	%ebp, %ebp		C initial carry bit
    162 
    163 	orl	%ecx, %ecx		C shift
    164 	movl	(%esi,%ebx,4), %eax	C src low limb
    165 	jz	L(odd_entry)		C flags are still those of orl, movl leaves them alone
    166 
    167 	xorl	%edx, %edx		C initial carry limb (for even, if one)
    168 	incl	%ebx
    169 	jz	L(one)			C size 1, only the top limb to do
    170 
    171 	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
    172 	shrdl(	%cl, %edx, %eax)	C low limb of the shifted source
    173 
    174 	jmp	L(even_entry)
    175 
    176 
    177 	ALIGN(8)
    178 L(odd_top):
    179 	C eax	scratch
    180 	C ebx	counter, limbs, negative
    181 	C ecx	shift, zero on this path
    182 	C edx	scratch, high limb of the product
    183 	C esi	src end
    184 	C edi	dst end
    185 	C ebp	carry bit, 0 or -1
    186 
    187 	mull	PARAM_DIVISOR		C edx = high limb of q*divisor
    188 
    189 	movl	(%esi,%ebx,4), %eax	C next src limb
    190 	subl	%ebp, %edx		C add in the carry bit, ebp being 0 or -1
    191 
    192 	subl	%edx, %eax		C src limb minus carry limb
    193 
    194 	sbbl	%ebp, %ebp		C new carry bit from the borrow
    195 
    196 L(odd_entry):
    197 	imull	VAR_INVERSE, %eax	C quotient limb
    198 
    199 	movl	%eax, (%edi,%ebx,4)
    200 
    201 	incl	%ebx
    202 	jnz	L(odd_top)
    203 
    204 	popl	%edi
    205 	popl	%esi
    206 
    207 	popl	%ebp
    208 	popl	%ebx
    209 
    210 	ret
    211 
    212 L(even_top):
    213 	C eax	scratch
    214 	C ebx	counter, limbs, negative
    215 	C ecx	twos
    216 	C edx	scratch, high limb of the product
    217 	C esi	src end
    218 	C edi	dst end
    219 	C ebp	carry bit, 0 or -1
        	C
        	C ebx runs one limb ahead here, so the limb being produced is at
        	C offset -4 from the indexed address.
    220 
    221 	mull	PARAM_DIVISOR		C edx = high limb of q*divisor
    222 
    223 	subl	%ebp, %edx		C carry bit
    224 	movl	-4(%esi,%ebx,4), %eax	C src limb
    225 
    226 	movl	(%esi,%ebx,4), %ebp	C and one above it
    227 
    228 	shrdl(	%cl, %ebp, %eax)	C shifted src limb
    229 
    230 	subl	%edx, %eax		C carry limb
    231 
    232 	sbbl	%ebp, %ebp		C new carry bit from the borrow
    233 
    234 L(even_entry):
    235 	imull	VAR_INVERSE, %eax	C quotient limb
    236 
    237 	movl	%eax, -4(%edi,%ebx,4)
    238 	incl	%ebx
    239 
    240 	jnz	L(even_top)
    241 
        	C top limb, nothing above it to shift in
    242 	mull	PARAM_DIVISOR
    243 
    244 	movl	-4(%esi), %eax		C src high limb
    245 	subl	%ebp, %edx
    246 
    247 L(one):
    248 	shrl	%cl, %eax		C zeros come in at the top
    249 
    250 	subl	%edx, %eax		C no carry if division is exact
    251 
    252 	imull	VAR_INVERSE, %eax
    253 
    254 	movl	%eax, -4(%edi)		C dst high limb
    255 	nop				C protect against cache bank clash
    256 
    257 	popl	%edi
    258 	popl	%esi
    259 
    260 	popl	%ebp
    261 	popl	%ebx
    262 
    263 	ret
    264 
    265 EPILOGUE()
    266 ASM_END()
    267