Home | History | Annotate | Line # | Download | only in k7
mode1o.asm revision 1.1
      1  1.1  mrg dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
      2  1.1  mrg 
      3  1.1  mrg dnl  Copyright 2000, 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
      4  1.1  mrg dnl
      5  1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1  mrg dnl
      7  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or
      8  1.1  mrg dnl  modify it under the terms of the GNU Lesser General Public License as
      9  1.1  mrg dnl  published by the Free Software Foundation; either version 3 of the
     10  1.1  mrg dnl  License, or (at your option) any later version.
     11  1.1  mrg dnl
     12  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful,
     13  1.1  mrg dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
     14  1.1  mrg dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15  1.1  mrg dnl  Lesser General Public License for more details.
     16  1.1  mrg dnl
     17  1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     18  1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     19  1.1  mrg 
     20  1.1  mrg include(`../config.m4')
     21  1.1  mrg 
     22  1.1  mrg 
     23  1.1  mrg C          cycles/limb
     24  1.1  mrg C Athlon:     11.0
     25  1.1  mrg C Hammer:      7.0
     26  1.1  mrg 
     27  1.1  mrg 
     28  1.1  mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     29  1.1  mrg C                               mp_limb_t divisor);
     30  1.1  mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     31  1.1  mrg C                                mp_limb_t divisor, mp_limb_t carry);
     32  1.1  mrg C
     33  1.1  mrg C With the loop running at just 11 cycles it doesn't seem worth bothering to
     34  1.1  mrg C check for high<divisor to save one step.
     35  1.1  mrg C
     36  1.1  mrg C Using a divl for size==1 measures slower than the modexact method, which
     37  1.1  mrg C is not too surprising since for the latter it's only about 24 cycles to
     38  1.1  mrg C calculate the modular inverse.
     39  1.1  mrg 
     40  1.1  mrg defframe(PARAM_CARRY,  16)
     41  1.1  mrg defframe(PARAM_DIVISOR,12)
     42  1.1  mrg defframe(PARAM_SIZE,   8)
     43  1.1  mrg defframe(PARAM_SRC,    4)
     44  1.1  mrg 
     45  1.1  mrg defframe(SAVE_EBX,     -4)
     46  1.1  mrg defframe(SAVE_ESI,     -8)
     47  1.1  mrg defframe(SAVE_EDI,    -12)
     48  1.1  mrg defframe(SAVE_EBP,    -16)
     49  1.1  mrg 
     50  1.1  mrg deflit(STACK_SPACE, 16)
     51  1.1  mrg 
     52  1.1  mrg 	TEXT
     53  1.1  mrg 
     54  1.1  mrg 	ALIGN(16)
     55  1.1  mrg PROLOGUE(mpn_modexact_1c_odd)
     56  1.1  mrg deflit(`FRAME',0)
     57  1.1  mrg 
     58  1.1  mrg 	movl	PARAM_CARRY, %ecx
     59  1.1  mrg 	jmp	L(start_1c)
     60  1.1  mrg 
     61  1.1  mrg EPILOGUE()
     62  1.1  mrg 
     63  1.1  mrg 
     64  1.1  mrg 	ALIGN(16)
     65  1.1  mrg PROLOGUE(mpn_modexact_1_odd)
     66  1.1  mrg deflit(`FRAME',0)
     67  1.1  mrg 
     68  1.1  mrg 	xorl	%ecx, %ecx
     69  1.1  mrg L(start_1c):
     70  1.1  mrg 	movl	PARAM_DIVISOR, %eax
     71  1.1  mrg 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
     72  1.1  mrg 
     73  1.1  mrg 	movl	%esi, SAVE_ESI
     74  1.1  mrg 	movl	PARAM_DIVISOR, %esi
     75  1.1  mrg 
     76  1.1  mrg 	movl	%edi, SAVE_EDI
     77  1.1  mrg 
     78  1.1  mrg 	shrl	%eax			C d/2
     79  1.1  mrg 
     80  1.1  mrg 	andl	$127, %eax
     81  1.1  mrg 
     82  1.1  mrg ifdef(`PIC',`
     83  1.1  mrg 	LEA(	binvert_limb_table, %edi)
     84  1.1  mrg 	movzbl	(%eax,%edi), %edi		C inv 8 bits
     85  1.1  mrg ',`
     86  1.1  mrg 	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
     87  1.1  mrg ')
     88  1.1  mrg 
     89  1.1  mrg 	xorl	%edx, %edx		C initial extra carry
     90  1.1  mrg 	leal	(%edi,%edi), %eax	C 2*inv
     91  1.1  mrg 
     92  1.1  mrg 	imull	%edi, %edi		C inv*inv
     93  1.1  mrg 
     94  1.1  mrg 	movl	%ebp, SAVE_EBP
     95  1.1  mrg 	movl	PARAM_SIZE, %ebp
     96  1.1  mrg 
     97  1.1  mrg 	movl	%ebx, SAVE_EBX
     98  1.1  mrg 	movl	PARAM_SRC, %ebx
     99  1.1  mrg 
    100  1.1  mrg 	imull	%esi, %edi		C inv*inv*d
    101  1.1  mrg 
    102  1.1  mrg 	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
    103  1.1  mrg 	leal	(%eax,%eax), %edi	C 2*inv
    104  1.1  mrg 
    105  1.1  mrg 	imull	%eax, %eax		C inv*inv
    106  1.1  mrg 
    107  1.1  mrg 	imull	%esi, %eax		C inv*inv*d
    108  1.1  mrg 
    109  1.1  mrg 	leal	(%ebx,%ebp,4), %ebx	C src end
    110  1.1  mrg 	negl	%ebp			C -size
    111  1.1  mrg 
    112  1.1  mrg 	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
    113  1.1  mrg 
    114  1.1  mrg 	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
    115  1.1  mrg 	movl	%esi, %eax
    116  1.1  mrg 	imull	%edi, %eax
    117  1.1  mrg 	cmpl	$1, %eax')
    118  1.1  mrg 
    119  1.1  mrg 
    120  1.1  mrg C The dependent chain here is
    121  1.1  mrg C
    122  1.1  mrg C                            cycles
    123  1.1  mrg C	subl	%edx, %eax	1
    124  1.1  mrg C	imull	%edi, %eax	4
    125  1.1  mrg C	mull	%esi		6  (high limb)
    126  1.1  mrg C			      ----
    127  1.1  mrg C       total		       11
    128  1.1  mrg C
    129  1.1  mrg C Out of order execution hides the load latency for the source data, so no
    130  1.1  mrg C special scheduling is required.
    131  1.1  mrg 
    132  1.1  mrg L(top):
    133  1.1  mrg 	C eax	src limb
    134  1.1  mrg 	C ebx	src end ptr
    135  1.1  mrg 	C ecx	next carry bit, 0 or 1 (or initial carry param)
    136  1.1  mrg 	C edx	carry limb, high of last product
    137  1.1  mrg 	C esi	divisor
    138  1.1  mrg 	C edi	inverse
    139  1.1  mrg 	C ebp	counter, limbs, negative
    140  1.1  mrg 
    141  1.1  mrg 	movl	(%ebx,%ebp,4), %eax
    142  1.1  mrg 
    143  1.1  mrg 	subl	%ecx, %eax		C apply carry bit
    144  1.1  mrg 	movl	$0, %ecx
    145  1.1  mrg 
    146  1.1  mrg 	setc	%cl			C new carry bit
    147  1.1  mrg 
    148  1.1  mrg 	subl	%edx, %eax		C apply carry limb
    149  1.1  mrg 	adcl	$0, %ecx
    150  1.1  mrg 
    151  1.1  mrg 	imull	%edi, %eax
    152  1.1  mrg 
    153  1.1  mrg 	mull	%esi
    154  1.1  mrg 
    155  1.1  mrg 	incl	%ebp
    156  1.1  mrg 	jnz	L(top)
    157  1.1  mrg 
    158  1.1  mrg 
    159  1.1  mrg 	movl	SAVE_ESI, %esi
    160  1.1  mrg 	movl	SAVE_EDI, %edi
    161  1.1  mrg 	leal	(%ecx,%edx), %eax
    162  1.1  mrg 
    163  1.1  mrg 	movl	SAVE_EBX, %ebx
    164  1.1  mrg 	movl	SAVE_EBP, %ebp
    165  1.1  mrg 	addl	$STACK_SPACE, %esp
    166  1.1  mrg 
    167  1.1  mrg 	ret
    168  1.1  mrg 
    169  1.1  mrg EPILOGUE()
    170