Home | History | Annotate | Line # | Download | only in k6
      1      1.1  mrg dnl  AMD K6 mpn_modexact_1_odd -- exact division style remainder.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Copyright 2000-2003, 2007 Free Software Foundation, Inc.
      4  1.1.1.2  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6      1.1  mrg dnl
      7  1.1.1.2  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23  1.1.1.2  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26      1.1  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg include(`../config.m4')
     32      1.1  mrg 
     33      1.1  mrg 
     34      1.1  mrg C K6: 10.0 cycles/limb
     35      1.1  mrg 
     36      1.1  mrg 
     37      1.1  mrg C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
     38      1.1  mrg C                               mp_limb_t divisor);
     39      1.1  mrg C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
     40      1.1  mrg C                                mp_limb_t divisor, mp_limb_t carry);
     41      1.1  mrg C
     42      1.1  mrg C A special case for high<divisor at the end measured only about 4 cycles
     43      1.1  mrg C faster, and so isn't used.
     44      1.1  mrg C
     45      1.1  mrg C A special case for size==1 using a divl rather than the inverse measured
     46      1.1  mrg C only about 5 cycles faster, and so isn't used.  When size==1 and
     47      1.1  mrg C high<divisor it can skip a division and be a full 24 cycles faster, but
     48      1.1  mrg C this isn't an important case.
     49      1.1  mrg 
     50      1.1  mrg defframe(PARAM_CARRY,  16)
     51      1.1  mrg defframe(PARAM_DIVISOR,12)
     52      1.1  mrg defframe(PARAM_SIZE,   8)
     53      1.1  mrg defframe(PARAM_SRC,    4)
     54      1.1  mrg 
     55      1.1  mrg 	TEXT
     56      1.1  mrg 
     57      1.1  mrg 	ALIGN(32)
     58      1.1  mrg PROLOGUE(mpn_modexact_1c_odd)
     59      1.1  mrg deflit(`FRAME',0)
     60      1.1  mrg 
     61      1.1  mrg 	movl	PARAM_DIVISOR, %ecx
     62      1.1  mrg 	pushl	%esi		FRAME_pushl()
     63      1.1  mrg 
     64      1.1  mrg 	movl	PARAM_CARRY, %edx
     65      1.1  mrg 	jmp	L(start_1c)
     66      1.1  mrg 
     67      1.1  mrg EPILOGUE()
     68      1.1  mrg 
     69      1.1  mrg 
     70      1.1  mrg 	ALIGN(16)
     71      1.1  mrg PROLOGUE(mpn_modexact_1_odd)
     72      1.1  mrg deflit(`FRAME',0)
     73      1.1  mrg 
     74      1.1  mrg 	movl	PARAM_DIVISOR, %ecx
     75      1.1  mrg 	pushl	%esi		FRAME_pushl()
     76      1.1  mrg 
     77      1.1  mrg 	xorl	%edx, %edx
     78      1.1  mrg L(start_1c):
     79      1.1  mrg 	pushl	%edi		FRAME_pushl()
     80      1.1  mrg 
     81      1.1  mrg 	shrl	%ecx			C d/2
     82      1.1  mrg 	movl	PARAM_DIVISOR, %esi
     83      1.1  mrg 
     84      1.1  mrg 	andl	$127, %ecx		C d/2, 7 bits
     85      1.1  mrg 	pushl	%ebp		FRAME_pushl()
     86      1.1  mrg 
     87      1.1  mrg ifdef(`PIC',`
     88      1.1  mrg 	LEA(	binvert_limb_table, %edi)
     89      1.1  mrg Zdisp(	movzbl,	0,(%ecx,%edi), %edi)		C inv 8 bits
     90      1.1  mrg ',`
     91      1.1  mrg 	movzbl	binvert_limb_table(%ecx), %edi	C inv 8 bits
     92      1.1  mrg ')
     93      1.1  mrg 	leal	(%edi,%edi), %ecx	C 2*inv
     94      1.1  mrg 
     95      1.1  mrg 	imull	%edi, %edi		C inv*inv
     96      1.1  mrg 
     97      1.1  mrg 	movl	PARAM_SRC, %eax
     98      1.1  mrg 	movl	PARAM_SIZE, %ebp
     99      1.1  mrg 
    100      1.1  mrg 	imull	%esi, %edi		C inv*inv*d
    101      1.1  mrg 
    102      1.1  mrg 	pushl	%ebx		FRAME_pushl()
    103      1.1  mrg 	leal	(%eax,%ebp,4), %ebx	C src end
    104      1.1  mrg 
    105      1.1  mrg 	subl	%edi, %ecx		C inv = 2*inv - inv*inv*d
    106      1.1  mrg 	leal	(%ecx,%ecx), %edi	C 2*inv
    107      1.1  mrg 
    108      1.1  mrg 	imull	%ecx, %ecx		C inv*inv
    109      1.1  mrg 
    110      1.1  mrg 	movl	(%eax), %eax		C src low limb
    111      1.1  mrg 	negl	%ebp			C -size
    112      1.1  mrg 
    113      1.1  mrg 	imull	%esi, %ecx		C inv*inv*d
    114      1.1  mrg 
    115      1.1  mrg 	subl	%ecx, %edi		C inv = 2*inv - inv*inv*d
    116      1.1  mrg 
    117      1.1  mrg 	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
    118      1.1  mrg 	pushl	%eax
    119      1.1  mrg 	movl	%esi, %eax
    120      1.1  mrg 	imull	%edi, %eax
    121      1.1  mrg 	cmpl	$1, %eax
    122      1.1  mrg 	popl	%eax')
    123      1.1  mrg 
    124      1.1  mrg 	jmp	L(entry)
    125      1.1  mrg 
    126      1.1  mrg 
    127      1.1  mrg C Rotating the mul to the top of the loop saves 1 cycle, presumably by
    128      1.1  mrg C hiding the loop control under the imul latency.
    129      1.1  mrg C
    130      1.1  mrg C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
    131      1.1  mrg C only 8).  It's not clear how to get down to 9 cycles.
    132      1.1  mrg C
    133      1.1  mrg C The xor and rcl to handle the carry bit could be an sbb instead, with the
    134      1.1  mrg C the carry bit add becoming a sub, but that doesn't save anything.
    135      1.1  mrg 
    136      1.1  mrg L(top):
    137      1.1  mrg 	C eax	(low product)
    138      1.1  mrg 	C ebx	src end
    139      1.1  mrg 	C ecx	carry bit, 0 or 1
    140      1.1  mrg 	C edx	(high product, being carry limb)
    141      1.1  mrg 	C esi	divisor
    142      1.1  mrg 	C edi	inverse
    143      1.1  mrg 	C ebp	counter, limbs, negative
    144      1.1  mrg 
    145      1.1  mrg 	mull	%esi
    146      1.1  mrg 
    147      1.1  mrg 	movl	(%ebx,%ebp,4), %eax
    148      1.1  mrg 	addl	%ecx, %edx		C apply carry bit to carry limb
    149      1.1  mrg 
    150      1.1  mrg L(entry):
    151      1.1  mrg 	xorl	%ecx, %ecx
    152      1.1  mrg 	subl	%edx, %eax		C apply carry limb
    153      1.1  mrg 
    154      1.1  mrg 	rcll	%ecx
    155      1.1  mrg 
    156      1.1  mrg 	imull	%edi, %eax
    157      1.1  mrg 
    158      1.1  mrg 	incl	%ebp
    159      1.1  mrg 	jnz	L(top)
    160      1.1  mrg 
    161      1.1  mrg 
    162      1.1  mrg 
    163      1.1  mrg 	popl	%ebx
    164      1.1  mrg 	popl	%ebp
    165      1.1  mrg 
    166      1.1  mrg 	mull	%esi
    167      1.1  mrg 
    168      1.1  mrg 	popl	%edi
    169      1.1  mrg 	popl	%esi
    170      1.1  mrg 
    171      1.1  mrg 	leal	(%ecx,%edx), %eax
    172      1.1  mrg 
    173      1.1  mrg 	ret
    174      1.1  mrg 
    175      1.1  mrg EPILOGUE()
    176  1.1.1.2  mrg ASM_END()
    177