Home | History | Annotate | Line # | Download | only in bd1
mul_1.asm revision 1.1.1.3
      1 dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
      2 
      3 dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C AMD K8,K9      3.65
     35 C AMD K10        3.30    3.68
     36 C AMD bull       4.04    4.29
     37 C AMD pile       4.33
     38 C AMD steam
     39 C AMD excavator
     40 C AMD bobcat     5.73
     41 C AMD jaguar     5.87
     42 C Intel P4      12.5
     43 C Intel core2    4.38
     44 C Intel NHM      4.28
     45 C Intel SBR      2.69
     46 C Intel IBR      2.55
     47 C Intel HWL      2.41
     48 C Intel BWL      2.49
     49 C Intel SKL      2.50
     50 C Intel atom    20.3
     51 C Intel SLM      7.8
     52 C VIA nano       4.25
     53 
     54 C The loop of this code is the result of running a code generation and
     55 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
     56 
     57 C TODO
     58 C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
     59 
     60 define(`rp',      `%rdi')   C result pointer, stores go through it (DOS64 arg reg: rcx)
     61 define(`up',      `%rsi')   C source pointer, u limbs are loaded through it (DOS64 arg reg: rdx)
     62 define(`n_param', `%rdx')   C limb count as passed in (DOS64 arg reg: r8)
     63 define(`v0',      `%rcx')   C multiplier limb, the operand of every mul (DOS64 arg reg: r9)
     64 
        C Working copy of the limb count.  Under STD64 the count arrives in rdx,
        C which mul overwrites, so the code copies it away before the first mul.
     65 define(`n',       `%rbx')
     66 
     67 ABI_SUPPORT(DOS64)
     68 ABI_SUPPORT(STD64)
     69 
        C DOS64 overrides of the aliases above.  rp, v0 and n are bound to the
        C registers listed as DOS64 arg regs (rcx, r9, r8); up stays %rsi and is
        C loaded from rdx by the entry code.  The r9 -> rdi and r8 -> rbx
        C definitions rename the scratch registers that the code below writes
        C literally as %r9 and %r8.
        C NOTE(review): the doubled quotes on v0 and n appear intended to keep
        C those two expansions from being renamed by the r9/r8 definitions --
        C confirm against the m4 quoting rules.
     70 IFDOS(`	define(`up', ``%rsi'')	') dnl
     71 IFDOS(`	define(`rp', ``%rcx'')	') dnl
     72 IFDOS(`	define(`v0', ``%r9'')	') dnl
     73 IFDOS(`	define(`r9', ``rdi'')	') dnl
     74 IFDOS(`	define(`n',  ``%r8'')	') dnl
     75 IFDOS(`	define(`r8', ``rbx'')	') dnl
     76 
     77 ASM_START()
     78 	TEXT
     79 	ALIGN(16)
     80 PROLOGUE(mpn_mul_1c)
        C mpn_mul_1c: entry point that takes a carry-in limb in addition to the
        C rp, up, n, v0 arguments.  It forms the product of the first u limb and
        C v0, adds the carry-in to the 128-bit result in rdx:rax, and then jumps
        C to L(common) inside mpn_mul_1, which does the remaining work and
        C returns.  The carry-in is read from %r8 under STD64 and from the stack
        C under DOS64.
     81 IFDOS(``push	%rsi		'')	C saved here, popped again before the ret in mpn_mul_1
     82 IFDOS(``push	%rdi		'')	C saved here, popped again before the ret in mpn_mul_1
     83 IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx under DOS64, the code expects it in rsi
     84 
     85 	mov	(up), %rax		C read first u limb early
     86 	push	%rbx
     87 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     88 IFDOS(`	mov	n, %r11		')
     89 	mul	v0
     90 
        C Add the carry-in limb to the low half of the product and propagate
        C the carry into the high half.
     91 IFSTD(` add	%r8, %rax	')
        C DOS64: 40 is the offset of the fifth argument from rsp at entry
        C (return address plus 32 bytes of shadow space); the three pushes
        C above moved rsp down by another 24 bytes.
     92 IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
     93 	adc	$0, %rdx
        C Continue in mpn_mul_1 with rdx:rax = first product plus carry-in and
        C r11 = n, the same state mpn_mul_1 has when it reaches L(common).
     94 	jmp	L(common)
     95 
     96 EPILOGUE()
     97 
     98 	ALIGN(16)
     99 PROLOGUE(mpn_mul_1)
        C mpn_mul_1: multiply the n limbs at up by the single limb v0, store
        C the n low result limbs at rp, and return the most significant limb
        C in rax.
        C
        C Structure: the first product is computed before anything else, then
        C the code dispatches on n mod 4 to one of four feed-in blocks
        C (b0, b1, b2, b3).  Each feed-in block places the first product in the
        C registers the loop expects at its entry point, advances up to the end
        C of the source, and turns n into a negative index that counts up
        C towards zero.  The loop handles four limbs per pass.
        C
        C r8, r9, r10 and r11 (as written; see the DOS64 renaming at the top of
        C the file) carry the partial sums from one mul to the next.
    100 IFDOS(``push	%rsi		'')	C popped again before ret
    101 IFDOS(``push	%rdi		'')	C popped again before ret
    102 IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx under DOS64, the code expects it in rsi
    103 
    104 	mov	(up), %rax		C read first u limb early
    105 	push	%rbx
    106 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
    107 IFDOS(`	mov	n, %r11		')
    108 	mul	v0
    109 
        C mpn_mul_1c joins here.  State: rdx:rax = first product (plus carry-in
        C when coming from mpn_mul_1c), r11 = n.
    110 L(common):
    111 IFSTD(`	mov	%r11, n		')
    112 
    113 	and	$3, R32(%r11)		C r11 = n mod 4, sets ZF for the jz below
    114 	lea	-16(rp,n,8), rp		C rp = rp + 8*n - 16; lea leaves the flags from and intact
    115 	jz	L(b0)			C n mod 4 == 0
    116 	cmp	$2, R32(%r11)
    117 	jb	L(b1)			C n mod 4 == 1
    118 	jz	L(b2)			C n mod 4 == 2
    119 
        C n mod 4 == 3: one extra mul is done here, then the loop is entered
        C at L(L3).  not n gives -n-1, which the add of 4 at L(L3) brings to
        C the index the loop expects.
    120 L(b3):	mov	%rax, %r10
    121 	mov	%rdx, %r11
    122 	mov	8(up), %rax
    123 	mul	v0
    124 	lea	(up,n,8), up
    125 	not	n
    126 	jmp	L(L3)
    127 
        C n mod 4 == 0: load the second u limb and enter the loop at L(L0).
    128 L(b0):	mov	%rax, %r9
    129 	mov	%rdx, %r10
    130 	mov	8(up), %rax
    131 	lea	(up,n,8), up
    132 	neg	n
    133 	jmp	L(L0)
    134 
        C n mod 4 == 1: n == 1 is finished at L(n1) without entering the loop.
        C Otherwise the lowest result limb is stored right here and the loop is
        C entered at L(L1).
    135 L(b1):	mov	%rax, %r8
    136 	cmp	$1, n
    137 	jz	L(n1)
    138 	mov	%rdx, %r9
    139 	lea	(up,n,8), up
    140 	neg	n
    141 	mov	%r8, 16(rp,n,8)		C with rp = rp + 8*n - 16 and n negated this is the first result limb
    142 	inc	n
    143 	jmp	L(L1)
    144 
        C n mod 4 == 2: when n == 2 the index reaches zero after the add and
        C the code goes straight to L(end); otherwise it falls through into the
        C top of the loop.
    145 L(b2):	mov	%rax, %r11
    146 	mov	%rdx, %r8
    147 	mov	8(up), %rax
    148 	lea	(up,n,8), up
    149 	neg	n
    150 	add	$2, n
    151 	jns	L(end)
    152 
        C Main loop, four mul per pass.  n is a negative index that is raised
        C by 4 each pass; the loop repeats while it is still negative.  Each mul
        C result is added to the high half saved from the previous mul, and adc
        C folds the carry into the new high half.
    153 	ALIGN(16)
    154 L(top):	mul	v0
    155 	mov	%rdx, %r9
    156 	add	%rax, %r8
    157 	adc	$0, %r9
    158 	mov	%r8, 8(rp,n,8)
    159 	mov	%r11, (rp,n,8)
    160 L(L1):	mov	(up,n,8), %rax
    161 	mul	v0
    162 	add	%rax, %r9
    163 	mov	%rdx, %r10
    164 	mov	8(up,n,8), %rax
    165 	adc	$0, %r10
    166 L(L0):	mul	v0
    167 	add	%rax, %r10
    168 	mov	%rdx, %r11
    169 	mov	16(up,n,8), %rax
    170 	adc	$0, %r11
    171 	mul	v0
    172 	mov	%r9, 16(rp,n,8)
    173 L(L3):	add	%rax, %r11
    174 	mov	%r10, 24(rp,n,8)
    175 	mov	%rdx, %r8
    176 	adc	$0, %r8
    177 	add	$4, n			C sets the sign flag tested by js below
    178 	mov	-8(up,n,8), %rax	C next u limb; mov does not change the flags
    179 	js	L(top)
    180 
        C Wind-down: one last mul, then store the two remaining result limbs.
        C rp still holds rp + 8*n - 16 from L(common), so (rp) and 8(rp) are
        C the last two limbs of the result.
    181 L(end):	mul	v0
    182 	add	%rax, %r8
    183 	adc	$0, %rdx
    184 	mov	%r11, (rp)
        C The n == 1 case joins here with r8 = low product limb and rdx = high
        C product limb; 8(rp) is then the only result limb.
    185 L(n1):	mov	%r8, 8(rp)
    186 	mov	%rdx, %rax		C return value: most significant limb
    187 
    188 	pop	%rbx
    189 IFDOS(``pop	%rdi		'')
    190 IFDOS(``pop	%rsi		'')
    191 	ret
    192 EPILOGUE()
    193 ASM_END()
    194