Home | History | Annotate | Line # | Download | only in p6
      1  1.1  mrg dnl  x86 mpn_gcd_11 optimised for processors with fast BSF.
      2  1.1  mrg 
      3  1.1  mrg dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked by Torbjorn Granlund.
      4  1.1  mrg 
      5  1.1  mrg dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
      6  1.1  mrg dnl  Foundation, Inc.
      7  1.1  mrg 
      8  1.1  mrg dnl  This file is part of the GNU MP Library.
      9  1.1  mrg dnl
     10  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     11  1.1  mrg dnl  it under the terms of either:
     12  1.1  mrg dnl
     13  1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     14  1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     15  1.1  mrg dnl      option) any later version.
     16  1.1  mrg dnl
     17  1.1  mrg dnl  or
     18  1.1  mrg dnl
     19  1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     20  1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     21  1.1  mrg dnl      later version.
     22  1.1  mrg dnl
     23  1.1  mrg dnl  or both in parallel, as here.
     24  1.1  mrg dnl
     25  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     26  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     27  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     28  1.1  mrg dnl  for more details.
     29  1.1  mrg dnl
     30  1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     31  1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     32  1.1  mrg dnl  see https://www.gnu.org/licenses/.
     33  1.1  mrg 
     34  1.1  mrg include(`../config.m4')
     35  1.1  mrg 
     36  1.1  mrg 
     37  1.1  mrg C	     cycles/bit (approx)
     38  1.1  mrg C AMD K7	 7.80
     39  1.1  mrg C AMD K8,K9	 7.79
     40  1.1  mrg C AMD K10	 4.08
     41  1.1  mrg C AMD bd1	 ?
     42  1.1  mrg C AMD bobcat	 7.82
     43  1.1  mrg C Intel P4-2	14.9
     44  1.1  mrg C Intel P4-3/4	14.0
     45  1.1  mrg C Intel P6/13	 5.09
     46  1.1  mrg C Intel core2	 4.22
     47  1.1  mrg C Intel NHM	 5.00
     48  1.1  mrg C Intel SBR	 5.00
     49  1.1  mrg C Intel atom	17.1
     50  1.1  mrg C VIA nano	?
     51  1.1  mrg C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
     52  1.1  mrg 
     53  1.1  mrg 
     54  1.1  mrg define(`u0',    `%eax')	C u0 names %eax, the register loaded with the first stack argument
     55  1.1  mrg define(`v0',    `%edx')	C v0 names %edx, the register loaded with the second stack argument
                  C NOTE(review): mpn_gcd_11 below writes %eax and %edx literally and never
                  C expands u0 or v0, so within this file the two macros are documentation only.
     56  1.1  mrg 
     57  1.1  mrg ASM_START()
     58  1.1  mrg 	TEXT
     59  1.1  mrg 	ALIGN(16)
                  C mpn_gcd_11 -- GCD of two single 32-bit values by repeated subtract and shift.
                  C
                  C In:    4(%esp) = u and 8(%esp) = v at entry.  They are read below at
                  C        12(%esp) and 16(%esp) because two registers have been pushed by then.
                  C Out:   %eax = the common value reached when the loop ends with u == v.
                  C Saved: %edi and %esi (pushed on entry, popped before ret).
                  C Clobb: %ecx, %edx, flags.
                  C
                  C NOTE(review): presumably both operands must be odd -- execution enters at
                  C L(odd) and nothing strips trailing zero bits before the first subtraction.
                  C Confirm against the mpn_gcd_11 contract.  The only loop exit is u == v, so
                  C as written a zero operand never terminates.
     60  1.1  mrg PROLOGUE(mpn_gcd_11)
                  C Free two scratch registers; this moves the argument offsets up by 8.
     61  1.1  mrg 	push	%edi
     62  1.1  mrg 	push	%esi
     63  1.1  mrg 
     64  1.1  mrg 	mov	12(%esp), %eax
     65  1.1  mrg 	mov	16(%esp), %edx
                  C Enter in the middle of the loop: the select-and-shift step at L(top) needs
                  C the difference, the shift count and the flags, none of which exist yet.
     66  1.1  mrg 	jmp	L(odd)
     67  1.1  mrg 
                  C Loop-carried state at L(odd): %eax = u, %edx = v.
                  C
                  C Each pass from L(odd) computes, without yet committing anything:
                  C   %esi = v - u        candidate new u when u < v
                  C   %ecx = bsf(v - u)   shift count; v - u and u - v differ only in sign, so
                  C                       their lowest set bit is at the same position
                  C   %edi = u            candidate new v when u < v
                  C   %eax = u - v        candidate new u when u >= v; this sub sets ZF and CF
                  C
                  C L(top) is reached only through the jnz, so both cmovc see the carry from
                  C that final sub: CF set means u < v (unsigned), and the pair then installs
                  C %esi and %edi.  The shr afterwards strips the trailing zero bits of the
                  C difference.  When u == v the bsf source is zero and %ecx is not meaningful,
                  C but the jnz falls through and %ecx is never used.
     68  1.1  mrg 	ALIGN(16)		C               K10   BD    C2    NHM   SBR
     69  1.1  mrg L(top):	cmovc(	%esi, %eax)	C u = |v - u|   0,3   0,3   0,6   0,5   0,5
     70  1.1  mrg 	cmovc(	%edi, %edx)	C v = min(u,v)  0,3   0,3   2,8   1,7   1,7
     71  1.1  mrg 	shr	%cl, %eax	C               1,7   1,6   2,8   2,8   2,8
     72  1.1  mrg L(odd):	mov	%edx, %esi	C               1     1     4     3     3
     73  1.1  mrg 	sub	%eax, %esi	C               2     2     5     4     4
     74  1.1  mrg 	bsf	%esi, %ecx	C               3     3     6     5     5
     75  1.1  mrg 	mov	%eax, %edi	C               2     2     3     3     4
     76  1.1  mrg 	sub	%edx, %eax	C               2     2     4     3     4
     77  1.1  mrg 	jnz	L(top)		C
     78  1.1  mrg 
                  C Here u == v, so %eax is zero after the sub and %edx holds the result.
                  C L(end) is reached by fall-through only; no jump in this function targets it.
     79  1.1  mrg L(end):	mov	%edx, %eax
                  C Restore in reverse push order.
     80  1.1  mrg 	pop	%esi
     81  1.1  mrg 	pop	%edi
     82  1.1  mrg 	ret
     83  1.1  mrg EPILOGUE()
     84