Home | History | Annotate | Line # | Download | only in sse2
mode1o.asm revision 1.1.1.3
      1 dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
      2 
      3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C P4: 19.0 cycles/limb
     35 
     36 
     37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     38 C                               mp_limb_t divisor);
     39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     40 C                                mp_limb_t divisor, mp_limb_t carry);
     41 C
     42 
     43 defframe(PARAM_CARRY,  16)
     44 defframe(PARAM_DIVISOR,12)
     45 defframe(PARAM_SIZE,   8)
     46 defframe(PARAM_SRC,    4)
     47 
     48 	TEXT
     49 
     50 	ALIGN(16)
     51 PROLOGUE(mpn_modexact_1c_odd)
     52 deflit(`FRAME',0)
     53 
     54 	movd	PARAM_CARRY, %mm1
     55 	jmp	L(start_1c)
     56 
     57 EPILOGUE()
     58 
     59 
     60 	ALIGN(16)
     61 PROLOGUE(mpn_modexact_1_odd)
     62 deflit(`FRAME',0)
     63 
     64 	pxor	%mm1, %mm1		C carry limb
     65 L(start_1c):
     66 	movl	PARAM_DIVISOR, %eax
     67 
     68 	movd	PARAM_DIVISOR, %mm7
     69 
     70 	shrl	%eax
     71 
     72 	andl	$127, %eax		C d/2, 7 bits
     73 
     74 ifdef(`PIC',`
     75 	LEA(	binvert_limb_table, %edx)
     76 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     77 ',`
     78 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     79 ')
     80 
     81 	C
     82 
     83 	movd	%eax, %mm6		C inv
     84 
     85 	movd	%eax, %mm0		C inv
     86 
     87 	pmuludq	%mm6, %mm6		C inv*inv
     88 
     89 	C
     90 
     91 	pmuludq	%mm7, %mm6		C inv*inv*d
     92 	paddd	%mm0, %mm0		C 2*inv
     93 
     94 	C
     95 
     96 	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
     97 	pxor	%mm6, %mm6
     98 
     99 	paddd	%mm0, %mm6
    100 	pmuludq	%mm0, %mm0		C inv*inv
    101 
    102 	C
    103 
    104 	pmuludq	%mm7, %mm0		C inv*inv*d
    105 	paddd	%mm6, %mm6		C 2*inv
    106 
    107 
    108 	movl	PARAM_SRC, %eax
    109 	movl	PARAM_SIZE, %ecx
    110 
    111 	C
    112 
    113 	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
    114 
    115 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    116 	pushl	%eax	FRAME_pushl()
    117 	movd	%mm6, %eax
    118 	imul	PARAM_DIVISOR, %eax
    119 	cmpl	$1, %eax
    120 	popl	%eax	FRAME_popl()')
    121 
    122 	pxor	%mm0, %mm0		C carry bit
    123 
    124 
    125 C The dependent chain here is as follows.
    126 C
    127 C					latency
    128 C	psubq	 s = (src-cbit) - climb	   2
    129 C	pmuludq	 q = s*inverse		   8
    130 C	pmuludq	 prod = q*divisor	   8
    131 C	psrlq	 climb = high(prod)	   2
    132 C					  --
    133 C					  20
    134 C
    135 C Yet the loop measures 19.0 c/l, so obviously there's something gained
    136 C there over a straight reading of the chip documentation.
    137 
    138 L(top):
    139 	C eax	src, incrementing
    140 	C ebx
    141 	C ecx	counter, limbs
    142 	C edx
    143 	C
    144 	C mm0	carry bit
    145 	C mm1	carry limb
    146 	C mm6	inverse
    147 	C mm7	divisor
    148 
    149 	movd	(%eax), %mm2
    150 	addl	$4, %eax
    151 
    152 	psubq	%mm0, %mm2		C src - cbit
    153 
    154 	psubq	%mm1, %mm2		C src - cbit - climb
    155 	movq	%mm2, %mm0
    156 	psrlq	$63, %mm0		C new cbit
    157 
    158 	pmuludq	%mm6, %mm2		C s*inverse
    159 
    160 	movq	%mm7, %mm1
    161 	pmuludq	%mm2, %mm1		C q*divisor
    162 	psrlq	$32, %mm1		C new climb
    163 
    164 	subl	$1, %ecx
    165 	jnz	L(top)
    166 
    167 
    168 L(done):
    169 	paddq	%mm1, %mm0
    170 	movd	%mm0, %eax
    171 	emms
    172 	ret
    173 
    174 EPILOGUE()
    175 ASM_END()
    176