Home | History | Annotate | Line # | Download | only in sse2
      1      1.1  mrg dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
      4  1.1.1.3  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.3  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.3  mrg dnl  it under the terms of either:
      9  1.1.1.3  mrg dnl
     10  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.3  mrg dnl      option) any later version.
     13  1.1.1.3  mrg dnl
     14  1.1.1.3  mrg dnl  or
     15  1.1.1.3  mrg dnl
     16  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.3  mrg dnl      later version.
     19  1.1.1.3  mrg dnl
     20  1.1.1.3  mrg dnl  or both in parallel, as here.
     21  1.1.1.3  mrg dnl
     22  1.1.1.3  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.3  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.3  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C P4: 19.0 cycles/limb
     35      1.1  mrg 
     36      1.1  mrg 
     37      1.1  mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     38      1.1  mrg C                               mp_limb_t divisor);
     39      1.1  mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     40      1.1  mrg C                                mp_limb_t divisor, mp_limb_t carry);
     41      1.1  mrg C
     42      1.1  mrg 
     43      1.1  mrg defframe(PARAM_CARRY,  16)
     44      1.1  mrg defframe(PARAM_DIVISOR,12)
     45      1.1  mrg defframe(PARAM_SIZE,   8)
     46      1.1  mrg defframe(PARAM_SRC,    4)
     47      1.1  mrg 
     48      1.1  mrg 	TEXT
     49      1.1  mrg 
     50      1.1  mrg 	ALIGN(16)
     51      1.1  mrg PROLOGUE(mpn_modexact_1c_odd)
     52      1.1  mrg deflit(`FRAME',0)
     53      1.1  mrg 
     54      1.1  mrg 	movd	PARAM_CARRY, %mm1
     55      1.1  mrg 	jmp	L(start_1c)
     56      1.1  mrg 
     57      1.1  mrg EPILOGUE()
     58      1.1  mrg 
     59      1.1  mrg 
     60      1.1  mrg 	ALIGN(16)
     61      1.1  mrg PROLOGUE(mpn_modexact_1_odd)
     62      1.1  mrg deflit(`FRAME',0)
     63      1.1  mrg 
     64      1.1  mrg 	pxor	%mm1, %mm1		C carry limb
     65      1.1  mrg L(start_1c):
     66      1.1  mrg 	movl	PARAM_DIVISOR, %eax
     67      1.1  mrg 
     68      1.1  mrg 	movd	PARAM_DIVISOR, %mm7
     69      1.1  mrg 
     70      1.1  mrg 	shrl	%eax
     71      1.1  mrg 
     72      1.1  mrg 	andl	$127, %eax		C d/2, 7 bits
     73      1.1  mrg 
     74      1.1  mrg ifdef(`PIC',`
     75      1.1  mrg 	LEA(	binvert_limb_table, %edx)
     76      1.1  mrg 	movzbl	(%eax,%edx), %eax		C inv 8 bits
     77      1.1  mrg ',`
     78      1.1  mrg 	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
     79      1.1  mrg ')
     80      1.1  mrg 
     81      1.1  mrg 	C
     82      1.1  mrg 
     83      1.1  mrg 	movd	%eax, %mm6		C inv
     84      1.1  mrg 
     85      1.1  mrg 	movd	%eax, %mm0		C inv
     86      1.1  mrg 
     87      1.1  mrg 	pmuludq	%mm6, %mm6		C inv*inv
     88      1.1  mrg 
     89      1.1  mrg 	C
     90      1.1  mrg 
     91      1.1  mrg 	pmuludq	%mm7, %mm6		C inv*inv*d
     92      1.1  mrg 	paddd	%mm0, %mm0		C 2*inv
     93      1.1  mrg 
     94      1.1  mrg 	C
     95      1.1  mrg 
     96      1.1  mrg 	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
     97      1.1  mrg 	pxor	%mm6, %mm6
     98      1.1  mrg 
     99      1.1  mrg 	paddd	%mm0, %mm6
    100      1.1  mrg 	pmuludq	%mm0, %mm0		C inv*inv
    101      1.1  mrg 
    102      1.1  mrg 	C
    103      1.1  mrg 
    104      1.1  mrg 	pmuludq	%mm7, %mm0		C inv*inv*d
    105      1.1  mrg 	paddd	%mm6, %mm6		C 2*inv
    106      1.1  mrg 
    107      1.1  mrg 
    108      1.1  mrg 	movl	PARAM_SRC, %eax
    109      1.1  mrg 	movl	PARAM_SIZE, %ecx
    110      1.1  mrg 
    111      1.1  mrg 	C
    112      1.1  mrg 
    113      1.1  mrg 	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
    114      1.1  mrg 
    115      1.1  mrg 	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
    116      1.1  mrg 	pushl	%eax	FRAME_pushl()
    117      1.1  mrg 	movd	%mm6, %eax
    118      1.1  mrg 	imul	PARAM_DIVISOR, %eax
    119      1.1  mrg 	cmpl	$1, %eax
    120      1.1  mrg 	popl	%eax	FRAME_popl()')
    121      1.1  mrg 
    122      1.1  mrg 	pxor	%mm0, %mm0		C carry bit
    123      1.1  mrg 
    124      1.1  mrg 
    125      1.1  mrg C The dependent chain here is as follows.
    126      1.1  mrg C
    127  1.1.1.2  mrg C					latency
    128  1.1.1.2  mrg C	psubq	 s = (src-cbit) - climb	   2
    129  1.1.1.2  mrg C	pmuludq	 q = s*inverse		   8
    130  1.1.1.2  mrg C	pmuludq	 prod = q*divisor	   8
    131  1.1.1.2  mrg C	psrlq	 climb = high(prod)	   2
    132  1.1.1.2  mrg C					  --
    133  1.1.1.2  mrg C					  20
    134      1.1  mrg C
    135      1.1  mrg C Yet the loop measures 19.0 c/l, so obviously there's something gained
    136      1.1  mrg C there over a straight reading of the chip documentation.
    137      1.1  mrg 
    138      1.1  mrg L(top):
    139      1.1  mrg 	C eax	src, incrementing
    140      1.1  mrg 	C ebx
    141      1.1  mrg 	C ecx	counter, limbs
    142      1.1  mrg 	C edx
    143      1.1  mrg 	C
    144      1.1  mrg 	C mm0	carry bit
    145      1.1  mrg 	C mm1	carry limb
    146      1.1  mrg 	C mm6	inverse
    147      1.1  mrg 	C mm7	divisor
    148      1.1  mrg 
    149      1.1  mrg 	movd	(%eax), %mm2
    150      1.1  mrg 	addl	$4, %eax
    151      1.1  mrg 
    152      1.1  mrg 	psubq	%mm0, %mm2		C src - cbit
    153      1.1  mrg 
    154      1.1  mrg 	psubq	%mm1, %mm2		C src - cbit - climb
    155      1.1  mrg 	movq	%mm2, %mm0
    156      1.1  mrg 	psrlq	$63, %mm0		C new cbit
    157      1.1  mrg 
    158      1.1  mrg 	pmuludq	%mm6, %mm2		C s*inverse
    159      1.1  mrg 
    160      1.1  mrg 	movq	%mm7, %mm1
    161      1.1  mrg 	pmuludq	%mm2, %mm1		C q*divisor
    162      1.1  mrg 	psrlq	$32, %mm1		C new climb
    163      1.1  mrg 
    164      1.1  mrg 	subl	$1, %ecx
    165      1.1  mrg 	jnz	L(top)
    166      1.1  mrg 
    167      1.1  mrg 
    168      1.1  mrg L(done):
    169      1.1  mrg 	paddq	%mm1, %mm0
    170      1.1  mrg 	movd	%mm0, %eax
    171      1.1  mrg 	emms
    172      1.1  mrg 	ret
    173      1.1  mrg 
    174      1.1  mrg EPILOGUE()
    175  1.1.1.3  mrg ASM_END()
    176