Home | History | Annotate | Line # | Download | only in k7
mode1o.asm revision 1.1.1.2
      1 dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
      2 
      3 dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C          cycles/limb
     35 C Athlon:     11.0
     36 C Hammer:      7.0
     37 
     38 
     39 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     40 C                               mp_limb_t divisor);
     41 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     42 C                                mp_limb_t divisor, mp_limb_t carry);
     43 C
     44 C With the loop running at just 11 cycles it doesn't seem worth bothering to
     45 C check for high<divisor to save one step.
     46 C
     47 C Using a divl for size==1 measures slower than the modexact method, which
     48 C is not too surprising since for the latter it's only about 24 cycles to
     49 C calculate the modular inverse.
     50 
     51 defframe(PARAM_CARRY,  16)
     52 defframe(PARAM_DIVISOR,12)
     53 defframe(PARAM_SIZE,   8)
     54 defframe(PARAM_SRC,    4)
     55 
     56 defframe(SAVE_EBX,     -4)
     57 defframe(SAVE_ESI,     -8)
     58 defframe(SAVE_EDI,    -12)
     59 defframe(SAVE_EBP,    -16)
     60 
     61 deflit(STACK_SPACE, 16)
     62 
     63 	TEXT
     64 
     65 	ALIGN(16)
     66 PROLOGUE(mpn_modexact_1c_odd)
     67 deflit(`FRAME',0)
     68 
     69 	movl	PARAM_CARRY, %ecx
     70 	jmp	L(start_1c)
     71 
     72 EPILOGUE()
     73 
     74 
     75 	ALIGN(16)
     76 PROLOGUE(mpn_modexact_1_odd)
     77 deflit(`FRAME',0)
     78 
     79 	xorl	%ecx, %ecx
     80 L(start_1c):
     81 	movl	PARAM_DIVISOR, %eax
     82 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
     83 
     84 	movl	%esi, SAVE_ESI
     85 	movl	PARAM_DIVISOR, %esi
     86 
     87 	movl	%edi, SAVE_EDI
     88 
     89 	shrl	%eax			C d/2
     90 
     91 	andl	$127, %eax
     92 
     93 ifdef(`PIC',`
     94 	LEA(	binvert_limb_table, %edi)
     95 	movzbl	(%eax,%edi), %edi		C inv 8 bits
     96 ',`
     97 	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
     98 ')
     99 
    100 	xorl	%edx, %edx		C initial extra carry
    101 	leal	(%edi,%edi), %eax	C 2*inv
    102 
    103 	imull	%edi, %edi		C inv*inv
    104 
    105 	movl	%ebp, SAVE_EBP
    106 	movl	PARAM_SIZE, %ebp
    107 
    108 	movl	%ebx, SAVE_EBX
    109 	movl	PARAM_SRC, %ebx
    110 
    111 	imull	%esi, %edi		C inv*inv*d
    112 
    113 	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
    114 	leal	(%eax,%eax), %edi	C 2*inv
    115 
    116 	imull	%eax, %eax		C inv*inv
    117 
    118 	imull	%esi, %eax		C inv*inv*d
    119 
    120 	leal	(%ebx,%ebp,4), %ebx	C src end
    121 	negl	%ebp			C -size
    122 
    123 	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
    124 
    125 	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
    126 	movl	%esi, %eax
    127 	imull	%edi, %eax
    128 	cmpl	$1, %eax')
    129 
    130 
    131 C The dependent chain here is
    132 C
    133 C                            cycles
    134 C	subl	%edx, %eax	1
    135 C	imull	%edi, %eax	4
    136 C	mull	%esi		6  (high limb)
    137 C			      ----
    138 C       total		       11
    139 C
    140 C Out of order execution hides the load latency for the source data, so no
    141 C special scheduling is required.
    142 
    143 L(top):
    144 	C eax	src limb
    145 	C ebx	src end ptr
    146 	C ecx	next carry bit, 0 or 1 (or initial carry param)
    147 	C edx	carry limb, high of last product
    148 	C esi	divisor
    149 	C edi	inverse
    150 	C ebp	counter, limbs, negative
    151 
    152 	movl	(%ebx,%ebp,4), %eax
    153 
    154 	subl	%ecx, %eax		C apply carry bit
    155 	movl	$0, %ecx
    156 
    157 	setc	%cl			C new carry bit
    158 
    159 	subl	%edx, %eax		C apply carry limb
    160 	adcl	$0, %ecx
    161 
    162 	imull	%edi, %eax
    163 
    164 	mull	%esi
    165 
    166 	incl	%ebp
    167 	jnz	L(top)
    168 
    169 
    170 	movl	SAVE_ESI, %esi
    171 	movl	SAVE_EDI, %edi
    172 	leal	(%ecx,%edx), %eax
    173 
    174 	movl	SAVE_EBX, %ebx
    175 	movl	SAVE_EBP, %ebp
    176 	addl	$STACK_SPACE, %esp
    177 
    178 	ret
    179 
    180 EPILOGUE()
    181 ASM_END()
    182