Home | History | Annotate | Line # | Download | only in k7
mode1o.asm revision 1.1.1.1
      1 dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
      2 
      3 dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
      4 dnl
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or
      8 dnl  modify it under the terms of the GNU Lesser General Public License as
      9 dnl  published by the Free Software Foundation; either version 3 of the
     10 dnl  License, or (at your option) any later version.
     11 dnl
     12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
     13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 dnl  Lesser General Public License for more details.
     16 dnl
     17 dnl  You should have received a copy of the GNU Lesser General Public License
     18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19 
     20 include(`../config.m4')
     21 
     22 
     23 C          cycles/limb
     24 C Athlon:     11.0
     25 C Hammer:      7.0
     26 
     27 
     28 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     29 C                               mp_limb_t divisor);
     30 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     31 C                                mp_limb_t divisor, mp_limb_t carry);
     32 C
     33 C With the loop running at just 11 cycles it doesn't seem worth bothering to
     34 C check for high<divisor to save one step.
     35 C
     36 C Using a divl for size==1 measures slower than the modexact method, which
     37 C is not too surprising since for the latter it's only about 24 cycles to
     38 C calculate the modular inverse.
     39 
     40 defframe(PARAM_CARRY,  16)	C 4th argument, read only by mpn_modexact_1c_odd
     41 defframe(PARAM_DIVISOR,12)	C 3rd argument, the divisor
     42 defframe(PARAM_SIZE,   8)	C 2nd argument, number of source limbs
     43 defframe(PARAM_SRC,    4)	C 1st argument, pointer to the source limbs
     44 
        C Save slots for the four registers the shared body overwrites and
        C restores before returning.  The offsets are negative, and the body
        C lowers esp by STACK_SPACE before storing to them.
     45 defframe(SAVE_EBX,     -4)
     46 defframe(SAVE_ESI,     -8)
     47 defframe(SAVE_EDI,    -12)
     48 defframe(SAVE_EBP,    -16)
     49 
     50 deflit(STACK_SPACE, 16)		C bytes subtracted from esp: four 4-byte SAVE_* slots
     51 
     52 	TEXT
     53 
     54 	ALIGN(16)
     55 PROLOGUE(mpn_modexact_1c_odd)
     56 deflit(`FRAME',0)
        C Entry point for the variant that takes an explicit initial carry.
        C It only loads the carry argument into ecx and then jumps to
        C L(start_1c) inside mpn_modexact_1_odd, so everything else (register
        C saves, inverse calculation, limb loop, return) is the shared body.
        C No stack adjustment is made here; esp is unchanged at the jump.
     57 
     58 	movl	PARAM_CARRY, %ecx	C ecx = caller supplied carry
     59 	jmp	L(start_1c)		C skip the xorl that zeroes ecx
     60 
     61 EPILOGUE()
     62 
     63 
     64 	ALIGN(16)
     65 PROLOGUE(mpn_modexact_1_odd)
     66 deflit(`FRAME',0)
        C Entry point with a zero initial carry.
        C
        C The code from L(start_1c) onwards is shared with mpn_modexact_1c_odd,
        C which arrives with its carry argument already in ecx.  The shared
        C body does the following.
        C
        C   1. Saves ebx, esi, edi and ebp in the SAVE_* slots.
        C   2. Builds an inverse of the divisor d: an 8-bit value read from
        C      binvert_limb_table at index (d>>1)&127, followed by two steps of
        C      inv = 2*inv - inv*inv*d.  The ASSERT after the second step
        C      expects d*inv == 1 mod 2^GMP_LIMB_BITS.
        C   3. Runs L(top) once per source limb, starting at src[0] and moving
        C      upwards, carrying a borrow count in ecx and the high product
        C      limb in edx from one limb to the next.
        C   4. Returns ecx + edx in eax after restoring the saved registers.
        C
        C NOTE(review): the loop tests its counter only after the first pass,
        C so size is presumably required to be at least 1 -- confirm with the
        C callers.  The divisor is presumably odd, as the function name says;
        C the table index discards its low bit.
     67 
     68 	xorl	%ecx, %ecx		C no initial carry
     69 L(start_1c):
     70 	movl	PARAM_DIVISOR, %eax	C eax = d, scratch copy for the table index
     71 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
     72 
     73 	movl	%esi, SAVE_ESI
     74 	movl	PARAM_DIVISOR, %esi	C esi = d for the rest of the function
     75 
     76 	movl	%edi, SAVE_EDI
     77 
     78 	shrl	%eax			C d/2
     79 
     80 	andl	$127, %eax		C 7-bit table index
     81 
     82 ifdef(`PIC',`
     83 	LEA(	binvert_limb_table, %edi)
     84 	movzbl	(%eax,%edi), %edi		C inv 8 bits
     85 ',`
     86 	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
     87 ')
     88 
     89 	xorl	%edx, %edx		C initial extra carry
     90 	leal	(%edi,%edi), %eax	C 2*inv
     91 
     92 	imull	%edi, %edi		C inv*inv
     93 
     94 	movl	%ebp, SAVE_EBP
     95 	movl	PARAM_SIZE, %ebp	C ebp = size, negated further down
     96 
     97 	movl	%ebx, SAVE_EBX
     98 	movl	PARAM_SRC, %ebx		C ebx = src, advanced to the end further down
     99 
    100 	imull	%esi, %edi		C inv*inv*d
    101 
    102 	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
    103 	leal	(%eax,%eax), %edi	C 2*inv
    104 
    105 	imull	%eax, %eax		C inv*inv
    106 
    107 	imull	%esi, %eax		C inv*inv*d
    108 
    109 	leal	(%ebx,%ebp,4), %ebx	C src end
    110 	negl	%ebp			C -size
    111 
    112 	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
    113 
    114 	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
    115 	movl	%esi, %eax
    116 	imull	%edi, %eax
    117 	cmpl	$1, %eax')
    118 
    119 
    120 C The dependent chain here is
    121 C
    122 C                            cycles
    123 C	subl	%edx, %eax	1
    124 C	imull	%edi, %eax	4
    125 C	mull	%esi		6  (high limb)
    126 C			      ----
    127 C       total		       11
    128 C
    129 C Out of order execution hides the load latency for the source data, so no
    130 C special scheduling is required.
    131 
    132 L(top):
    133 	C eax	src limb
    134 	C ebx	src end ptr
    135 	C ecx	next carry bit, 0 or 1 (or initial carry param)
    136 	C edx	carry limb, high of last product
    137 	C esi	divisor
    138 	C edi	inverse
    139 	C ebp	counter, limbs, negative
        C
        C Each pass loads the limb at ebx+4*ebp, subtracts the carry bit and
        C the carry limb from it, multiplies the difference by the inverse,
        C then multiplies that by the divisor.  Only the high half of the
        C last product, in edx, is used afterwards; eax is overwritten by
        C the next load, or by the leal after the loop.
    140 
    141 	movl	(%ebx,%ebp,4), %eax
    142 
    143 	subl	%ecx, %eax		C apply carry bit
    144 	movl	$0, %ecx		C movl leaves the flags alone, so the borrow reaches setc
    145 
    146 	setc	%cl			C new carry bit
    147 
    148 	subl	%edx, %eax		C apply carry limb
    149 	adcl	$0, %ecx		C add in the borrow from the carry limb subtraction
    150 
    151 	imull	%edi, %eax		C low half of limb * inverse
    152 
    153 	mull	%esi			C edx = high half of eax * divisor, next carry limb
    154 
    155 	incl	%ebp			C counter runs up from -size to zero
    156 	jnz	L(top)
    157 
    158 
    159 	movl	SAVE_ESI, %esi
    160 	movl	SAVE_EDI, %edi
    161 	leal	(%ecx,%edx), %eax	C return value = carry bit + carry limb
    162 
    163 	movl	SAVE_EBX, %ebx
    164 	movl	SAVE_EBP, %ebp
    165 	addl	$STACK_SPACE, %esp	C release the save area
    166 
    167 	ret
    168 
    169 EPILOGUE()
    170