Home | History | Annotate | Line # | Download | only in sse2
mode1o.asm revision 1.1.1.1.2.1
      1 dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
      2 
      3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or
      8 dnl  modify it under the terms of the GNU Lesser General Public License as
      9 dnl  published by the Free Software Foundation; either version 3 of the
     10 dnl  License, or (at your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 dnl  Lesser General Public License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C P4: 19.0 cycles/limb
     24 
     25 
     26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     27 C                               mp_limb_t divisor);
     28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     29 C                                mp_limb_t divisor, mp_limb_t carry);
     30 C
     31 
     32 defframe(PARAM_CARRY,  16)
     33 defframe(PARAM_DIVISOR,12)
     34 defframe(PARAM_SIZE,   8)
     35 defframe(PARAM_SRC,    4)
     36 
     37 	TEXT
     38 
     39 	ALIGN(16)
     40 PROLOGUE(mpn_modexact_1c_odd)
     41 deflit(`FRAME',0)
     42 
     43 	movd	PARAM_CARRY, %mm1
     44 	jmp	L(start_1c)
     45 
     46 EPILOGUE()
     47 
     48 
     49 	ALIGN(16)
     50 PROLOGUE(mpn_modexact_1_odd)
     51 deflit(`FRAME',0)
     52 
     53 	pxor	%mm1, %mm1		C carry limb
     54 L(start_1c):
     55 	movl	PARAM_DIVISOR, %eax
     56 
     57 	movd	PARAM_DIVISOR, %mm7
     58 
     59 	shrl	%eax
     60 
     61 	andl	$127, %eax		C d/2, 7 bits
     62 
     63 ifdef(`PIC',`
     64 	LEA(	binvert_limb_table, %edx)
     65 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     66 ',`
     67 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     68 ')
     69 
     70 	C
     71 
     72 	movd	%eax, %mm6		C inv
     73 
     74 	movd	%eax, %mm0		C inv
     75 
     76 	pmuludq	%mm6, %mm6		C inv*inv
     77 
     78 	C
     79 
     80 	pmuludq	%mm7, %mm6		C inv*inv*d
     81 	paddd	%mm0, %mm0		C 2*inv
     82 
     83 	C
     84 
     85 	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
     86 	pxor	%mm6, %mm6
     87 
     88 	paddd	%mm0, %mm6
     89 	pmuludq	%mm0, %mm0		C inv*inv
     90 
     91 	C
     92 
     93 	pmuludq	%mm7, %mm0		C inv*inv*d
     94 	paddd	%mm6, %mm6		C 2*inv
     95 
     96 
     97 	movl	PARAM_SRC, %eax
     98 	movl	PARAM_SIZE, %ecx
     99 
    100 	C
    101 
    102 	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
    103 
    104 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    105 	pushl	%eax	FRAME_pushl()
    106 	movd	%mm6, %eax
    107 	imul	PARAM_DIVISOR, %eax
    108 	cmpl	$1, %eax
    109 	popl	%eax	FRAME_popl()')
    110 
    111 	pxor	%mm0, %mm0		C carry bit
    112 
    113 
    114 C The dependent chain here is as follows.
    115 C
    116 C					latency
    117 C	psubq	 s = (src-cbit) - climb	   2
    118 C	pmuludq	 q = s*inverse		   8
    119 C	pmuludq	 prod = q*divisor	   8
    120 C	psrlq	 climb = high(prod)	   2
    121 C					  --
    122 C					  20
    123 C
    124 C Yet the loop measures 19.0 c/l, so obviously there's something gained
    125 C there over a straight reading of the chip documentation.
    126 
    127 L(top):
    128 	C eax	src, incrementing
    129 	C ebx
    130 	C ecx	counter, limbs
    131 	C edx
    132 	C
    133 	C mm0	carry bit
    134 	C mm1	carry limb
    135 	C mm6	inverse
    136 	C mm7	divisor
    137 
    138 	movd	(%eax), %mm2
    139 	addl	$4, %eax
    140 
    141 	psubq	%mm0, %mm2		C src - cbit
    142 
    143 	psubq	%mm1, %mm2		C src - cbit - climb
    144 	movq	%mm2, %mm0
    145 	psrlq	$63, %mm0		C new cbit
    146 
    147 	pmuludq	%mm6, %mm2		C s*inverse
    148 
    149 	movq	%mm7, %mm1
    150 	pmuludq	%mm2, %mm1		C q*divisor
    151 	psrlq	$32, %mm1		C new climb
    152 
    153 	subl	$1, %ecx
    154 	jnz	L(top)
    155 
    156 
    157 L(done):
    158 	paddq	%mm1, %mm0
    159 	movd	%mm0, %eax
    160 	emms
    161 	ret
    162 
    163 EPILOGUE()
    164