Home | History | Annotate | Line # | Download | only in bd2
      1 dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, tzcnt, no shlx.
      2 
      3 dnl  Copyright 2019 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C	     cycles/bit
     35 C AMD K8,K9	12.3
     36 C AMD K10	 8.0
     37 C AMD bd1	10.0
     38 C AMD bd2	 7.2
     39 C AMD bd3	 ?
     40 C AMD bd4	 6.7
     41 C AMD bt1	13.6
     42 C AMD bt2	 8.9
     43 C AMD zn1	 5.7
     44 C AMD zn2	 5.6
     45 C Intel P4	 ?
     46 C Intel CNR	 9.7
     47 C Intel PNR	 9.7
     48 C Intel NHM	 9.4
     49 C Intel WSM	 9.5
     50 C Intel SBR	10.3
     51 C Intel IBR	 ?
     52 C Intel HWL	 8.2
     53 C Intel BWL	 7.4
     54 C Intel SKL	 7.3
     55 C Intel atom	26.5
     56 C Intel SLM	17.4
     57 C Intel GLM	13.4
     58 C Intel GLM+	12.4
     59 C VIA nano	 ?
     60 
     61 
     62 define(`u1',    `%rdi')	C u, high limb
     63 define(`u0',    `%rsi')	C u, low limb
     64 define(`v1',    `%rdx')	C v, high limb
     65 define(`v0_param', `%rcx')	C v, low limb as it arrives; copied to v0 on entry
     66 
     67 define(`v0',    `%rax')	C v, low limb, working copy used by the loop
     68 define(`cnt',   `%rcx')	C shift count; same register as v0_param
     69 
     70 define(`s0',    `%r8')	C copy of u0 taken before u is overwritten by u - v
     71 define(`s1',    `%r9')	C copy of u1 taken before u is overwritten by u - v
     72 define(`t0',    `%r10')	C low limb of v - u
     73 define(`t1',    `%r11')	C high limb of v - u; reused as a temp in the shift
     74 
     75 dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
        C Only STD64 is declared.  The DOS64 line above is disabled with dnl; its
        C note says the mp_double_limb_t result is returned in memory there.
     76 ABI_SUPPORT(STD64)
     77 
        C ASM_START, TEXT and ALIGN are macros defined outside this file.
        C NOTE(review): presumably they open the code section and align the
        C function entry to 64 bytes -- confirm against the included m4 files.
     78 ASM_START()
     79 	TEXT
     80 	ALIGN(64)
     81 PROLOGUE(mpn_gcd_22)
        C mpn_gcd_22: subtract-and-shift loop on the two-limb values
        C u = (u1,u0) and v = (v1,v0).  Each pass replaces u by |u - v| shifted
        C right by the trailing-zero count of the difference, and v by min(u,v).
        C The loop runs until both high limbs are zero (the remainder is handed
        C to mpn_gcd_11) or until u = v (the result is then v).
        C
        C In:    u1 = %rdi, u0 = %rsi, v1 = %rdx, v0 in %rcx (v0_param).
        C Out:   at L(end) the result is (v1,v0), already held in %rdx:%rax.
        C Uses:  %rcx (cnt) and %r8-%r11 as scratch; %rdi, %rsi, %rdx and %rax
        C        are updated in place.
        C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
        C presumably they adapt the calling convention for 4 arguments -- confirm.
     82 	FUNC_ENTRY(4)
        C Move v0 out of %rcx so that %rcx can serve as the shift count (cnt).
     83 	mov	v0_param, v0
     84 
     85 	ALIGN(16)
        C Loop head: (t1,t0) = v - u.  If the low limbs are equal the low limb of
        C the difference is zero and the special case at L(lowz) takes over.
     86 L(top):	mov	v0, t0
     87 	sub	u0, t0
     88 	jz	L(lowz)		C	jump when low limb result = 0
     89 	mov	v1, t1
     90 	sbb	u1, t1
     91 
        C cnt = number of trailing zero bits of the nonzero low limb of v - u.
        C u - v is the negation of v - u, so it has the same trailing-zero count.
     92 	rep;bsf	t0, cnt		C tzcnt!
        C Keep the old u in (s1,s0); it becomes the new v when u < v.
     93 	mov	u0, s0
     94 	mov	u1, s1
     95 
        C (u1,u0) = u - v.  The carry flag is set exactly when u < v, and it
        C drives the four cmovc instructions below.
     96 	sub	v0, u0
     97 	sbb	v1, u1
     98 
        C Also entered from L(lowz) with the carry flag set by its final sub.
     99 L(bck):	cmovc	t0, u0		C u = |u - v|
    100 	cmovc	t1, u1		C u = |u - v|
    101 	cmovc	s0, v0		C v = min(u,v)
    102 	cmovc	s1, v1		C v = min(u,v)
    103 
    104 C Rightshift (u1,,u0) into (u1,,u0)
        C u0 >>= cnt and u1 >>= cnt, then the bits that leave the bottom of u1
        C are or-ed into the top of u0.  Those bits are the old u1 shifted left
        C by -cnt; the hardware uses only the low 6 bits of the count for a
        C 64-bit shift, so -cnt behaves as 64 - cnt for cnt in 1..63.
        C NOTE(review): with cnt = 0 the shl leaves t1 = u1 unshifted, so the or
        C is only correct if u1 = 0 or cnt >= 1.  The L(lowz) path enters with
        C u1 = 0.  The main loop presumably relies on both operands being odd,
        C which makes the difference even -- confirm against the function contract.
    105 L(shr):	shr	R8(cnt), u0
    106 	mov	u1, t1
    107 	shr	R8(cnt), u1
    108 	neg	cnt
    109 	shl	R8(cnt), t1
    110 	or	t1, u0
    111 
        C Keep looping while either high limb is nonzero.
    112 	test	v1, v1
    113 	jnz	L(top)
    114 	test	u1, u1
    115 	jnz	L(top)
    116 
        C Both values now fit in one limb: tail-call mpn_gcd_11 with v0 in %rdi
        C and u0 in %rsi.  u0 already lives in %rsi, hence the disabled mov.
        C NOTE(review): the high limb of the result is whatever mpn_gcd_11 leaves
        C in %rdx; presumably it returns with %rdx = 0 -- confirm.
    117 L(gcd_11):
    118 	mov	v0, %rdi
    119 C	mov	u0, %rsi
    120 	TCALL(	mpn_gcd_11)
    121 
    122 L(lowz):C We come here when v0 - u0 = 0
    123 	C 1. If v1 - u1 = 0, then gcd is u = v.
    124 	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
    125 	mov	v1, t0
    126 	sub	u1, t0
    127 	je	L(end)
    128 
        C The difference is (u1 - v1) times 2^64, so its low limb is zero.  Place
        C the high-limb difference in the low limb (a 64-bit shift) and rejoin at
        C L(bck), where the remaining trailing zeros are shifted out.
        C   t0 = v1 - u1 and t1 = 0    selected at L(bck) when u1 < v1
        C   (s1,s0) = old u            becomes v when u1 < v1
        C   u0 = u1 - v1 and u1 = 0    the final sub sets carry when u1 < v1
        C Both xor instructions come before that sub, so the carry flag seen by
        C the cmovc instructions at L(bck) is the one from the sub.
    129 	xor	t1, t1
    130 	rep;bsf	t0, cnt		C tzcnt!
    131 	mov	u0, s0
    132 	mov	u1, s1
    133 	mov	u1, u0
    134 	xor	u1, u1
    135 	sub	v1, u0
    136 	jmp	L(bck)
    137 
        C u = v here.  The result (v1,v0) is already in %rdx:%rax because v1 is
        C %rdx and v0 is %rax, which is why the two moves below are disabled.
    138 L(end):	C mov	v0, %rax
    139 	C mov	v1, %rdx
    140 	FUNC_EXIT()
    141 	ret
    142 EPILOGUE()
    143