Home | History | Annotate | Line # | Download | only in bd1
      1 dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
      2 
      3 dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C AMD K8,K9      3.30    3.58
     35 C AMD K10        3.09
     36 C AMD bull       4.47    4.72
     37 C AMD pile       4.66
     38 C AMD steam
     39 C AMD excavator
     40 C AMD bobcat     6.30
     41 C AMD jaguar     6.29
     42 C Intel P4      17.3    17.8
     43 C Intel core2    5.13
     44 C Intel NHM      4.85
     45 C Intel SBR      3.83
     46 C Intel IBR      3.75
     47 C Intel HWL      3.45
     48 C Intel BWL      2.56
     49 C Intel SKL      2.53
     50 C Intel atom    20.3
     51 C Intel SLM      9
     52 C VIA nano
     53 
     54 C The loop of this code is the result of running a code generation and
     55 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
     56 
     57 C TODO
     58 C  * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.
     59 C Argument registers under STD64.  Trailing notes give the DOS64 arrival register.
     60 define(`rp',      `%rdi')   C destination limb pointer (rcx under DOS64)
     61 define(`up',      `%rsi')   C source limb pointer (rdx under DOS64)
     62 define(`n_param', `%rdx')   C limb count as passed in (r8 under DOS64)
     63 define(`v0',      `%rcx')   C multiplier limb (r9 under DOS64)
     64 C Working copy of the limb count; mul overwrites rdx so n_param cannot stay there.
     65 define(`n',       `%r11')
     66 C Select the read-modify-write instruction and the exported symbol name from
        C the OPERATION_* symbol defined for this build.  ADDSUB and func are the
        C only things that differ between the two functions; if neither symbol is
        C defined, both stay undefined.
     67 ifdef(`OPERATION_addmul_1',`
     68       define(`ADDSUB',        `add')
     69       define(`func',  `mpn_addmul_1')
     70 ')
     71 ifdef(`OPERATION_submul_1',`
     72       define(`ADDSUB',        `sub')
     73       define(`func',  `mpn_submul_1')
     74 ')
     75 C NOTE(review): ABI_SUPPORT is defined outside this file; by its name it
        C declares that this source handles both the DOS64 and STD64 calling
        C conventions, which matches the IFDOS and IFSTD lines further down.  Confirm.
     76 ABI_SUPPORT(DOS64)
     77 ABI_SUPPORT(STD64)
     78 C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
        C registers both entry points this one source can be built as.  Confirm.
     79 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
     80 C DOS64 overrides of the register aliases.  The doubled quotes keep the
        C register names in the alias values literal, so the r8 and r9 renames
        C below rewrite only the scratch uses written directly in the code.
     81 IFDOS(`	define(`up', ``%rsi'')	') dnl up: copied from rdx into rsi at function entry
     82 IFDOS(`	define(`rp', ``%rcx'')	') dnl rp: used in its arrival register
     83 IFDOS(`	define(`v0', ``%r9'')	') dnl v0: used in its arrival register
     84 IFDOS(`	define(`r9', ``rdi'')	') dnl scratch written as %r9 in the code becomes %rdi
     85 IFDOS(`	define(`n',  ``%r8'')	') dnl n: used in its arrival register, no copy to r11
     86 IFDOS(`	define(`r8', ``r11'')	') dnl scratch written as %r8 in the code becomes %r11
     87 C func(rp, up, n, v0): {rp,n} = {rp,n} +/- {up,n} * v0, where the sign is
        C chosen by ADDSUB.  The limb carried or borrowed out of the top is
        C returned in rax.
        C
        C The first source limb is read and multiplied unconditionally, so n == 0
        C is not handled; callers must pass n >= 1.
        C
        C Structure: the first product is formed, then the code dispatches on
        C n mod 4 to one of four feed-in blocks (b0..b3).  Each block places the
        C first product in the register pair expected at its entry point of the
        C 4-way unrolled loop, clears the remaining accumulators, preloads the
        C second source limb and turns n into a negative index that counts up
        C towards zero.  rp is biased to &rp[n-2] so the wind-down code can address
        C the last two limbs as (rp) and 8(rp).
        C
        C r8, r9, r10 and rbx rotate as accumulators.  Inside the loop most of the
        C clears sit between an ADDSUB or adc and the adc that consumes its carry,
        C so they must be mov $0 (flags untouched) and not xor.
     88 ASM_START()
     89 	TEXT
     90 	ALIGN(16)
     91 PROLOGUE(func)
     92 IFDOS(``push	%rsi		'')	C DOS64 only: rsi and rdi are callee-saved there
     93 IFDOS(``push	%rdi		'')
     94 IFDOS(``mov	%rdx, %rsi	'')	C DOS64 only: move up out of rdx, which mul overwrites
     95 
     96 	mov	(up), %rax		C read first u limb early
     97 	push	%rbx			C callee-saved; used below as an accumulator
     98 IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
     99 IFDOS(`	mov	n, %rbx		')
    100 	mul	v0			C rdx:rax = first limb * v0
    101 
    102 IFSTD(`	mov	%rbx, n		')
    103 
    104 	and	$3, R32(%rbx)		C rbx = n mod 4; ZF set when it is 0
    105 	lea	-16(rp,n,8), rp		C rp = &rp[n-2]; lea keeps the flags for the jz
    106 	jz	L(b0)
    107 	cmp	$2, R32(%rbx)
    108 	jb	L(b1)			C n mod 4 == 1
    109 	jz	L(b2)			C n mod 4 == 2
    110 
    111 L(b3):	mov	$0, R32(%r8)		C n mod 4 == 3 falls through to here
    112 	mov	%rax, %rbx		C low half of first product, stored at L3
    113 	mov	$0, R32(%r9)
    114 	mov	8(up), %rax		C second limb, multiplied at L3
    115 	mov	%rdx, %r10		C high half of first product
    116 	lea	(up,n,8), up		C up = one past the last source limb
    117 	not	n			C n = -n - 1
    118 	jmp	L(L3)
    119 
    120 L(b0):	mov	$0, R32(%r10)		C n mod 4 == 0
    121 	mov	%rax, %r8		C low half of first product, stored at L0
    122 	mov	%rdx, %rbx		C high half of first product
    123 	mov	8(up), %rax		C second limb, multiplied at L0
    124 	lea	(up,n,8), up		C up = one past the last source limb
    125 	neg	n			C n = -n
    126 	jmp	L(L0)
    127 
    128 L(b1):	cmp	$1, n			C single limb: product is already in rdx:rax
    129 	jz	L(n1)
    130 	mov	%rax, %r9		C low half of first product, stored at L1
    131 	mov	8(up), %rax		C second limb, multiplied at L1
    132 	mov	%rdx, %r8		C high half of first product
    133 	mov	$0, R32(%rbx)
    134 	lea	(up,n,8), up		C up = one past the last source limb
    135 	neg	n
    136 	inc	n			C n = 1 - n
    137 	jmp	L(L1)
    138 
    139 L(b2):	mov	$0, R32(%rbx)
    140 	mov	%rax, %r10		C low half of first product, stored at top or end
    141 	mov	%rdx, %r9		C high half of first product
    142 	mov	8(up), %rax		C second limb, multiplied at top or end
    143 	mov	$0, R32(%r8)
    144 	lea	(up,n,8), up		C up = one past the last source limb
    145 	neg	n
    146 	add	$2, n			C n = 2 - n; sign flag drives the jns
    147 	jns	L(end)			C exactly two limbs: skip the loop, else fall into it
    148 
    149 	ALIGN(32)
    150 L(top):	mul	v0			C loop head; b2 enters here by falling through
    151 	ADDSUB	%r10, (rp,n,8)
    152 	adc	%rax, %r9
    153 	mov	(up,n,8), %rax		C next source limb; mov does not disturb CF
    154 	adc	%rdx, %r8
    155 L(L1):	mul	v0			C entry from b1
    156 	mov	$0, R32(%r10)
    157 	ADDSUB	%r9, 8(rp,n,8)
    158 	adc	%rax, %r8
    159 	adc	%rdx, %rbx
    160 	mov	8(up,n,8), %rax
    161 L(L0):	mul	v0			C entry from b0
    162 	ADDSUB	%r8, 16(rp,n,8)
    163 	mov	$0, R32(%r8)		C must be mov, not xor: CF is live until the adc below
    164 	adc	%rax, %rbx
    165 	mov	$0, R32(%r9)		C likewise: CF feeds the adc into r10
    166 	mov	16(up,n,8), %rax
    167 	adc	%rdx, %r10
    168 L(L3):	mul	v0			C entry from b3
    169 	ADDSUB	%rbx, 24(rp,n,8)
    170 	mov	$0, R32(%rbx)		C likewise: CF feeds the adc into r10
    171 	adc	%rax, %r10
    172 	adc	%rdx, %r9
    173 	mov	24(up,n,8), %rax	C preload the limb for the next mul
    174 	add	$4, n			C four limbs handled per iteration
    175 	js	L(top)			C loop while the index is still negative
    176 
    177 L(end):	mul	v0			C product of the last source limb
    178 	ADDSUB	%r10, (rp)		C second-to-last destination limb
    179 	adc	%r9, %rax
    180 	adc	%r8, %rdx
    181 L(n1):	ADDSUB	%rax, 8(rp)		C last destination limb
    182 	adc	$0, %rdx		C fold the final carry or borrow into the high limb
    183 	mov	%rdx, %rax		C return value
    184 
    185 	pop	%rbx
    186 IFDOS(``pop	%rdi		'')	C DOS64 only: restore in reverse order of the pushes
    187 IFDOS(``pop	%rsi		'')
    188 	ret
    189 EPILOGUE()
    190 ASM_END()
    191