Home | History | Annotate | Line # | Download | only in bt1
      1 dnl  AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
      2 
      3 dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	    cycles/limb
     34 C AMD K8,K9	 1.77
     35 C AMD K10	 1.76\1.82
     36 C AMD bd1	 1.67\2.12
     37 C AMD bd2	 1.62\1.82
     38 C AMD bd3
     39 C AMD bd4	 1.55\2.2
     40 C AMD zen
     41 C AMD bt1	 2.54
     42 C AMD bt2	 2
     43 C Intel P4	11
     44 C Intel PNR	 4.76
     45 C Intel NHM	 5.27
     46 C Intel SBR	 2
     47 C Intel IBR	 1.94
     48 C Intel HWL	 1.63
     49 C Intel BWL	 1.51
     50 C Intel SKL	 1.51
     51 C Intel atom	 3.56
     52 C Intel SLM	 4
     53 C VIA nano
     54 
     55 C The loop of this code is the result of running a code generation and
     56 C optimization tool suite written by David Harvey and Torbjorn Granlund.
     57 
     58 C INPUT PARAMETERS
        C Each define names the register the code below uses for that argument
        C (the STD64 argument register).  The trailing comment on each line is
        C where the DOS64 ABI passes the same argument.
     59 define(`rp',	`%rdi')	C rcx
     60 define(`up',	`%rsi')	C rdx
     61 define(`vp',	`%rdx')	C r8
     62 define(`n',	`%rcx')	C r9
     63 define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
     64 
        C Pick the carry-propagating instruction and the two entry point names
        C for the operation being built: adc with the mpn_add_* names when
        C OPERATION_add_n is defined, sbb with the mpn_sub_* names when
        C OPERATION_sub_n is defined.
        C NOTE(review): if neither symbol is defined, ADCSBB, func and func_nc
        C stay undefined in this file - presumably the build always sets one.
     65 ifdef(`OPERATION_add_n', `
     66 	define(ADCSBB,	      adc)
     67 	define(func,	      mpn_add_n)
     68 	define(func_nc,	      mpn_add_nc)')
     69 ifdef(`OPERATION_sub_n', `
     70 	define(ADCSBB,	      sbb)
     71 	define(func,	      mpn_sub_n)
     72 	define(func_nc,	      mpn_sub_nc)')
     73 
     74 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
        C NOTE(review): MULFUNC_PROLOGUE presumably tells the build system which
        C functions this one source can provide; only the pair selected by the
        C OPERATION_* symbol is emitted per build - confirm in the m4 support files.
     75 
        C Both 64-bit ABIs are declared as supported; the code uses the
        C FUNC_ENTRY, FUNC_EXIT and IFDOS macros to cope with the difference.
     76 ABI_SUPPORT(DOS64)
     77 ABI_SUPPORT(STD64)
     78 
     79 ASM_START()
     80 	TEXT
     81 	ALIGN(16)	C alignment request for the func entry point that follows
     82 PROLOGUE(func)
        C func is mpn_add_n or mpn_sub_n, depending on the OPERATION_* symbol.
        C It combines the n limbs at up with the n limbs at vp using ADCSBB
        C (adc or sbb), stores the n result limbs at rp, and returns the final
        C carry or borrow (0 or 1) in rax.
        C
        C Register use after the dispatch:
        C   rax      limb index, biased per entry point (see below)
        C   rcx (n)  loop counter
        C   r8-r11   limbs loaded from up, then updated in place by ADCSBB
        C   CF       running carry/borrow
        C
        C NOTE(review): there is no n = 0 check.  With n = 0 the b00 path would
        C still load limbs and enter the loop, so callers presumably guarantee
        C n >= 1 - confirm against the mpn interface documentation.
        C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined elsewhere; they
        C presumably map the DOS64 argument registers onto the STD64 ones used
        C here and restore state on exit - confirm.
     83 	FUNC_ENTRY(4)
     84 	xor	%r8, %r8	C plain entry point: carry-in is 0
        C Shared entry (func_nc jumps here with the carry-in in r8).
        C Dispatch on n mod 4 to the matching entry into the 4-way unrolled loop.
     85 L(ent):	test	$1, R8(n)
     86 	jnz	L(bx1)	C n is odd
     87 
     88 L(bx0):	test	$2, R8(n)
     89 	jnz	L(b10)
     90 
        C n = 0 mod 4: enter at lo0 with limbs 0 and 1 preloaded.
     91 L(b00):	shr	$2, n	C n = n/4, the loop counter
     92 	neg	%r8	C CF = (carry-in != 0); must follow shr, which changes CF
     93 	mov	$3, R32(%rax)	C index 3: the -24 displacement at lo0 then hits limb 0
     94 	mov	(up), %r10
     95 	mov	8(up), %r11
     96 	jmp	L(lo0)
     97 
        C n = 2 mod 4: enter at top with limbs 0 and 1 preloaded.
     98 L(b10):	shr	$2, n	C n = n/4, the loop counter
     99 	neg	%r8	C CF = (carry-in != 0)
    100 	mov	$1, R32(%rax)	C index 1: the -8 displacement at top/cj2 then hits limb 0
    101 	mov	(up), %r8	C r8 is free for data now that the carry-in is in CF
    102 	mov	8(up), %r9
    103 	jrcxz	L(cj2)	C counter is 0, so n was 2: only the two tail limbs remain
    104 	jmp	L(top)
    105 
    106 L(bx1):	test	$2, R8(n)
    107 	jnz	L(b11)
    108 
        C n = 1 mod 4: enter at lo1 with limbs 0 and 1 preloaded.
    109 L(b01):	shr	$2, n	C n = n/4, the loop counter
    110 	neg	%r8	C CF = (carry-in != 0)
    111 	mov	$0, R32(%rax)	C index 0: the 0 displacement at lo1/cj1 hits limb 0
    112 	mov	(up), %r9
    113 	jrcxz	L(cj1)	C counter is 0, so n was 1: only the last tail limb remains
    114 	mov	8(up), %r10
    115 	jmp	L(lo1)
    116 
    117 	ALIGN(8)
        C n = 3 mod 4: enter at lo3 with limb 0 preloaded.
    118 L(b11):	inc	n
    119 	shr	$2, n	C n = (n+1)/4, which is n/4 + 1 here since n = 3 mod 4
    120 	neg	%r8	C CF = (carry-in != 0)
    121 	mov	$2, R32(%rax)	C index 2: the -16 displacement at lo3 then hits limb 0
    122 	mov	(up), %r11
    123 	jmp	L(lo3)
    124 
        C Main loop, four limbs per pass.  Each limb is loaded from up into a
        C register ahead of the ADCSBB that folds in the vp limb, then stored
        C to rp.  Inside the loop only ADCSBB writes CF (mov and lea leave the
        C flags alone and dec leaves CF alone), so the carry chain is unbroken
        C from one limb to the next and across passes.
    125 	ALIGN(4)
    126 L(top):	mov	8(up,%rax,8), %r10
    127 	ADCSBB	-8(vp,%rax,8), %r8
    128 	mov	%r8, -8(rp,%rax,8)
    129 L(lo1):	mov	16(up,%rax,8), %r11
    130 	ADCSBB	(vp,%rax,8), %r9
    131 	lea	4(%rax), %rax	C advance the index by four limbs without touching flags
    132 	mov	%r9, -32(rp,%rax,8)	C same limb as the ADCSBB above; -32 undoes the advance
    133 L(lo0):	ADCSBB	-24(vp,%rax,8), %r10
    134 	mov	%r10, -24(rp,%rax,8)
    135 L(lo3):	ADCSBB	-16(vp,%rax,8), %r11
    136 	dec	n	C sets ZF for the jnz below; the movs in between keep it
    137 	mov	-8(up,%rax,8), %r8	C preload the next two limbs (used at top/lo1 or in the tail)
    138 	mov	%r11, -16(rp,%rax,8)
    139 L(lo2):	mov	(up,%rax,8), %r9	C lo2 is not a jump target in the code shown here
    140 	jnz	L(top)
    141 
        C Wind-down: the last two limbs are already in r8 and r9.
    142 L(cj2):	ADCSBB	-8(vp,%rax,8), %r8
    143 	mov	%r8, -8(rp,%rax,8)
    144 L(cj1):	ADCSBB	(vp,%rax,8), %r9
    145 	mov	%r9, (rp,%rax,8)
    146 
    147 	mov	$0, R32(%rax)	C mov rather than xor, because xor would clear CF
    148 	adc	$0, R32(%rax)	C return value = final carry/borrow (0 or 1)
    149 
    150 	FUNC_EXIT()
    151 	ret
    152 EPILOGUE()
    153 
    154 	ALIGN(16)
    155 PROLOGUE(func_nc)
        C func_nc is mpn_add_nc or mpn_sub_nc, depending on the OPERATION_* symbol.
        C Same operation as func but with a carry-in argument cy.  It places cy in
        C r8 and joins func at L(ent), where neg turns any nonzero r8 into CF = 1.
        C The result limbs and the returned carry/borrow are produced by the
        C shared code in func.
    156 	FUNC_ENTRY(4)
        C STD64: cy already arrives in r8 (see the cy define), nothing to do.
        C DOS64: cy is a stack argument (rsp+40 at entry per the cy define).
        C NOTE(review): the offset used here is 56, presumably because FUNC_ENTRY
        C has pushed 16 bytes by this point - confirm against its definition.
    157 IFDOS(`	mov	56(%rsp), %r8	')
    158 	jmp	L(ent)	C tail-jump into func; func performs FUNC_EXIT and ret
    159 EPILOGUE()
    160