Home | History | Annotate | Line # | Download | only in k7
      1      1.1  mrg dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C          cycles/limb
     35      1.1  mrg C Athlon:     11.0
     36      1.1  mrg C Hammer:      7.0
     37      1.1  mrg 
     38      1.1  mrg 
     39      1.1  mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     40      1.1  mrg C                               mp_limb_t divisor);
     41      1.1  mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     42      1.1  mrg C                                mp_limb_t divisor, mp_limb_t carry);
     43      1.1  mrg C
     44      1.1  mrg C With the loop running at just 11 cycles it doesn't seem worth bothering to
     45      1.1  mrg C check for high<divisor to save one step.
     46      1.1  mrg C
     47      1.1  mrg C Using a divl for size==1 measures slower than the modexact method, which
     48      1.1  mrg C is not too surprising since for the latter it's only about 24 cycles to
     49      1.1  mrg C calculate the modular inverse.
     50      1.1  mrg 
     51      1.1  mrg defframe(PARAM_CARRY,  16)
     52      1.1  mrg defframe(PARAM_DIVISOR,12)
     53      1.1  mrg defframe(PARAM_SIZE,   8)
     54      1.1  mrg defframe(PARAM_SRC,    4)
     55      1.1  mrg 
     56      1.1  mrg defframe(SAVE_EBX,     -4)
     57      1.1  mrg defframe(SAVE_ESI,     -8)
     58      1.1  mrg defframe(SAVE_EDI,    -12)
     59      1.1  mrg defframe(SAVE_EBP,    -16)
     60      1.1  mrg 
     61      1.1  mrg deflit(STACK_SPACE, 16)
     62      1.1  mrg 
     63      1.1  mrg 	TEXT
     64      1.1  mrg 
     65      1.1  mrg 	ALIGN(16)
     66      1.1  mrg PROLOGUE(mpn_modexact_1c_odd)
     67      1.1  mrg deflit(`FRAME',0)
     68      1.1  mrg 
     69      1.1  mrg 	movl	PARAM_CARRY, %ecx
     70      1.1  mrg 	jmp	L(start_1c)
     71      1.1  mrg 
     72      1.1  mrg EPILOGUE()
     73      1.1  mrg 
     74      1.1  mrg 
     75      1.1  mrg 	ALIGN(16)
     76      1.1  mrg PROLOGUE(mpn_modexact_1_odd)
     77      1.1  mrg deflit(`FRAME',0)
     78      1.1  mrg 
     79      1.1  mrg 	xorl	%ecx, %ecx
     80      1.1  mrg L(start_1c):
     81      1.1  mrg 	movl	PARAM_DIVISOR, %eax
     82      1.1  mrg 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
     83      1.1  mrg 
     84      1.1  mrg 	movl	%esi, SAVE_ESI
     85      1.1  mrg 	movl	PARAM_DIVISOR, %esi
     86      1.1  mrg 
     87      1.1  mrg 	movl	%edi, SAVE_EDI
     88      1.1  mrg 
     89      1.1  mrg 	shrl	%eax			C d/2
     90      1.1  mrg 
     91      1.1  mrg 	andl	$127, %eax
     92      1.1  mrg 
     93      1.1  mrg ifdef(`PIC',`
     94      1.1  mrg 	LEA(	binvert_limb_table, %edi)
     95      1.1  mrg 	movzbl	(%eax,%edi), %edi		C inv 8 bits
     96      1.1  mrg ',`
     97      1.1  mrg 	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
     98      1.1  mrg ')
     99      1.1  mrg 
    100      1.1  mrg 	xorl	%edx, %edx		C initial extra carry
    101      1.1  mrg 	leal	(%edi,%edi), %eax	C 2*inv
    102      1.1  mrg 
    103      1.1  mrg 	imull	%edi, %edi		C inv*inv
    104      1.1  mrg 
    105      1.1  mrg 	movl	%ebp, SAVE_EBP
    106      1.1  mrg 	movl	PARAM_SIZE, %ebp
    107      1.1  mrg 
    108      1.1  mrg 	movl	%ebx, SAVE_EBX
    109      1.1  mrg 	movl	PARAM_SRC, %ebx
    110      1.1  mrg 
    111      1.1  mrg 	imull	%esi, %edi		C inv*inv*d
    112      1.1  mrg 
    113      1.1  mrg 	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
    114      1.1  mrg 	leal	(%eax,%eax), %edi	C 2*inv
    115      1.1  mrg 
    116      1.1  mrg 	imull	%eax, %eax		C inv*inv
    117      1.1  mrg 
    118      1.1  mrg 	imull	%esi, %eax		C inv*inv*d
    119      1.1  mrg 
    120      1.1  mrg 	leal	(%ebx,%ebp,4), %ebx	C src end
    121      1.1  mrg 	negl	%ebp			C -size
    122      1.1  mrg 
    123      1.1  mrg 	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
    124      1.1  mrg 
    125      1.1  mrg 	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
    126      1.1  mrg 	movl	%esi, %eax
    127      1.1  mrg 	imull	%edi, %eax
    128      1.1  mrg 	cmpl	$1, %eax')
    129      1.1  mrg 
    130      1.1  mrg 
    131      1.1  mrg C The dependent chain here is
    132      1.1  mrg C
    133      1.1  mrg C                            cycles
    134      1.1  mrg C	subl	%edx, %eax	1
    135      1.1  mrg C	imull	%edi, %eax	4
    136      1.1  mrg C	mull	%esi		6  (high limb)
    137      1.1  mrg C			      ----
    138      1.1  mrg C       total		       11
    139      1.1  mrg C
    140      1.1  mrg C Out of order execution hides the load latency for the source data, so no
    141      1.1  mrg C special scheduling is required.
    142      1.1  mrg 
    143      1.1  mrg L(top):
    144      1.1  mrg 	C eax	src limb
    145      1.1  mrg 	C ebx	src end ptr
    146      1.1  mrg 	C ecx	next carry bit, 0 or 1 (or initial carry param)
    147      1.1  mrg 	C edx	carry limb, high of last product
    148      1.1  mrg 	C esi	divisor
    149      1.1  mrg 	C edi	inverse
    150      1.1  mrg 	C ebp	counter, limbs, negative
    151      1.1  mrg 
    152      1.1  mrg 	movl	(%ebx,%ebp,4), %eax
    153      1.1  mrg 
    154      1.1  mrg 	subl	%ecx, %eax		C apply carry bit
    155      1.1  mrg 	movl	$0, %ecx
    156      1.1  mrg 
    157      1.1  mrg 	setc	%cl			C new carry bit
    158      1.1  mrg 
    159      1.1  mrg 	subl	%edx, %eax		C apply carry limb
    160      1.1  mrg 	adcl	$0, %ecx
    161      1.1  mrg 
    162      1.1  mrg 	imull	%edi, %eax
    163      1.1  mrg 
    164      1.1  mrg 	mull	%esi
    165      1.1  mrg 
    166      1.1  mrg 	incl	%ebp
    167      1.1  mrg 	jnz	L(top)
    168      1.1  mrg 
    169      1.1  mrg 
    170      1.1  mrg 	movl	SAVE_ESI, %esi
    171      1.1  mrg 	movl	SAVE_EDI, %edi
    172      1.1  mrg 	leal	(%ecx,%edx), %eax
    173      1.1  mrg 
    174      1.1  mrg 	movl	SAVE_EBX, %ebx
    175      1.1  mrg 	movl	SAVE_EBP, %ebp
    176      1.1  mrg 	addl	$STACK_SPACE, %esp
    177      1.1  mrg 
    178      1.1  mrg 	ret
    179      1.1  mrg 
    180      1.1  mrg EPILOGUE()
    181  1.1.1.2  mrg ASM_END()
    182