Home | History | Annotate | Line # | Download | only in bd1
mul_1.asm revision 1.1.1.2
      1 dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
      2 
      3 dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 C	     cycles/limb
     34 C AMD K8,K9
     35 C AMD K10
     36 C AMD bd1	 4
     37 C AMD bobcat
     38 C Intel P4
     39 C Intel core2
     40 C Intel NHM
     41 C Intel SBR
     42 C Intel atom
     43 C VIA nano
     44 
     45 C The loop of this code is the result of running a code generation and
     46 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
     47 
     48 C TODO
     49 C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
     50 
     51 define(`rp',      `%rdi')   C result pointer, stores go through it; DOS64 arg reg: rcx
     52 define(`up',      `%rsi')   C source pointer, limbs are loaded through it; DOS64 arg reg: rdx
     53 define(`n_param', `%rdx')   C incoming limb count; DOS64 arg reg: r8
     54 define(`v0',      `%rcx')   C multiplier, the operand of every mul; DOS64 arg reg: r9
     55 
        C n is the working copy of the count and later the loop index.  For STD64
        C it is kept in rbx because mul overwrites rdx; rbx is pushed and popped
        C by the code below.
     56 define(`n',       `%rbx')
     57 
        C ABI_SUPPORT is defined outside this file; presumably these two lines
        C declare that both the DOS64 and the STD64 conventions are handled here.
     58 ABI_SUPPORT(DOS64)
     59 ABI_SUPPORT(STD64)
     60 
        C DOS64-only remapping.  rp, v0 and n are bound to the registers they
        C arrive in (rcx, r9, r8); up stays in rsi, which the prologues load from
        C rdx.  The bare words r9 and r8 are then redefined as rdi and rbx, so a
        C %r9 or %r8 temporary written literally in the code below is emitted as
        C %rdi or %rbx and does not overwrite v0 or n.  The doubled quotes appear
        C intended to keep the register names inside rp, v0 and n from being
        C rescanned and caught by that redefinition (m4 quoting; confirm against
        C the IFDOS definition, which is not in this file).
     61 IFDOS(`	define(`up', ``%rsi'')	') dnl
     62 IFDOS(`	define(`rp', ``%rcx'')	') dnl
     63 IFDOS(`	define(`v0', ``%r9'')	') dnl
     64 IFDOS(`	define(`r9', ``rdi'')	') dnl
     65 IFDOS(`	define(`n',  ``%r8'')	') dnl
     66 IFDOS(`	define(`r8', ``rbx'')	') dnl
     67 
     68 ASM_START()
     69 	TEXT
     70 	ALIGN(16)
        C mpn_mul_1c -- entry point that takes an extra carry-in limb.
        C
        C In:   rp, up, count, v0 in the registers named by the defines earlier
        C       in the file; the carry-in limb in r8 (STD64) or on the stack
        C       (DOS64).
        C Does: forms up[0] * v0 in rdx:rax, adds the carry-in to the low half,
        C       folds the resulting carry into the high half, then jumps to
        C       L(common) inside mpn_mul_1, which does the rest and returns.
        C
        C The first source limb is read unconditionally, so no check for a zero
        C count is made here.
     71 PROLOGUE(mpn_mul_1c)
     72 IFDOS(``push	%rsi		'')	C DOS64: rsi will hold up; popped at the shared exit
     73 IFDOS(``push	%rdi		'')	C DOS64: rdi is used as a temporary; popped at the shared exit
     74 IFDOS(``mov	%rdx, %rsi	'')	C DOS64: up arrives in rdx, which mul overwrites
     75 
     76 	mov	(up), %rax		C read first u limb early
     77 	push	%rbx			C popped at the shared exit in mpn_mul_1
     78 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     79 IFDOS(`	mov	n, %r11		')
     80 	mul	v0			C rdx:rax = up[0] * v0
     81 
     82 IFSTD(` add	%r8, %rax	')	C add the carry-in limb to the low half
     83 IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
     84 	adc	$0, %rdx		C propagate the carry into the high half
     85 	jmp	L(common)		C continue in mpn_mul_1 with rdx:rax and r11 = count
     86 
     87 EPILOGUE()
     88 
     89 	ALIGN(16)
        C mpn_mul_1 -- multiply the n limbs at up by the single limb v0, store
        C the n low result limbs at rp, and return the final high limb in rax.
        C (Presumably mp_limb_t mpn_mul_1 (rp, up, n, v0); the C prototype is not
        C visible in this file.)
        C
        C mpn_mul_1c joins at L(common) with its carry-in already added to the
        C first product.
        C
        C Structure: the first product up[0] * v0 is formed in the prologue.
        C L(common) dispatches on n mod 4 to one of four feed-in blocks (b0, b1,
        C b2, b3).  Each block parks the first product in the registers the loop
        C expects, turns n into a negative index, and enters the loop, which
        C handles four limbs per pass, at the matching point (L0, L1, L3, or
        C L(top) by fall-through).  up is advanced to one past the last source
        C limb and rp to the second-to-last result limb, so the index counts
        C upward and the loop exits once the add to n no longer gives a negative
        C result.
        C
        C Under DOS64 the %r8 and %r9 written below are remapped by the IFDOS
        C defines earlier in the file (to rbx and rdi); real r8 and r9 hold n
        C and v0 there.
        C
        C The first source limb is read unconditionally, so no check for n = 0
        C is made.
     90 PROLOGUE(mpn_mul_1)
     91 IFDOS(``push	%rsi		'')	C DOS64: rsi will hold up
     92 IFDOS(``push	%rdi		'')	C DOS64: rdi is used as a temporary
     93 IFDOS(``mov	%rdx, %rsi	'')	C DOS64: up arrives in rdx, which mul overwrites
     94 
     95 	mov	(up), %rax		C read first u limb early
     96 	push	%rbx			C popped again just before ret
     97 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     98 IFDOS(`	mov	n, %r11		')
     99 	mul	v0			C rdx:rax = up[0] * v0
    100 
    101 L(common):
        C Here rdx:rax is the first product (plus carry-in when entered from
        C mpn_mul_1c) and r11 holds the count.
    102 IFSTD(`	mov	%r11, n		')	C STD64: working copy of the count
    103 
    104 	and	$3, R32(%r11)		C r11 = n mod 4; ZF set when that is 0
    105 	lea	-16(rp,n,8), rp		C rp += 8*n - 16; lea leaves the flags alone
    106 	jz	L(b0)			C n mod 4 = 0
    107 	cmp	$2, R32(%r11)
    108 	jb	L(b1)			C n mod 4 = 1
    109 	jz	L(b2)			C n mod 4 = 2
        C fall through: n mod 4 = 3
    110 
    111 L(b3):	mov	%rax, %r10		C low limb of the first product, stored at L(L3)
    112 	mov	%rdx, %r11		C high limb, added to the next product at L(L3)
    113 	mov	8(up), %rax		C second source limb
    114 	mul	v0
    115 	lea	(up,n,8), up		C up = one past the last source limb
    116 	not	n			C n = -(n + 1)
    117 	jmp	L(L3)
    118 
    119 L(b0):	mov	%rax, %r9		C low limb of the first product, stored after L(L0)
    120 	mov	%rdx, %r10		C high limb, added to the next product at L(L0)
    121 	mov	8(up), %rax		C second source limb, multiplied at L(L0)
    122 	lea	(up,n,8), up		C up = one past the last source limb
    123 	neg	n			C n = -n
    124 	jmp	L(L0)
    125 
    126 L(b1):	mov	%rax, %r8		C low limb of the first product
    127 	cmp	$1, n
    128 	jz	L(n1)			C n = 1: just store r8 and return rdx
    129 	mov	%rdx, %r9		C high limb, added to the next product at L(L1)
    130 	lea	(up,n,8), up		C up = one past the last source limb
    131 	neg	n
    132 	mov	%r8, 16(rp,n,8)		C store the first result limb now (address = incoming rp)
    133 	inc	n			C n = 1 - (original n)
    134 	jmp	L(L1)
    135 
    136 L(b2):	mov	%rax, %r11		C low limb of the first product, stored at L(top) or L(end)
    137 	mov	%rdx, %r8		C high limb, added to the next product
    138 	mov	8(up), %rax		C second source limb
    139 	lea	(up,n,8), up		C up = one past the last source limb
    140 	neg	n
    141 	add	$2, n			C n = 2 - (original n)
    142 	jns	L(end)			C not negative: only the final multiply remains
        C otherwise fall through into L(top)
    143 
    144 	ALIGN(16)
        C Main loop, four multiplies per pass.  r8, r9, r10 and r11 take turns
        C holding the limb being completed and the high half waiting for the
        C next product.  At L(top), rax already holds the next source limb, r8
        C the pending high half and r11 a finished limb that is not yet stored.
        C Only mov instructions sit between an add and the adc that consumes its
        C carry, and mov does not change the flags.
    145 L(top):	mul	v0
    146 	mov	%rdx, %r9
    147 	add	%rax, %r8		C r8 is now a finished result limb
    148 	adc	$0, %r9			C r9 = high half plus carry
    149 	mov	%r8, 8(rp,n,8)
    150 	mov	%r11, (rp,n,8)
    151 L(L1):	mov	(up,n,8), %rax
    152 	mul	v0
    153 	add	%rax, %r9		C r9 is now a finished result limb
    154 	mov	%rdx, %r10
    155 	mov	8(up,n,8), %rax		C load the next source limb early
    156 	adc	$0, %r10		C r10 = high half plus carry
    157 L(L0):	mul	v0
    158 	add	%rax, %r10		C r10 is now a finished result limb
    159 	mov	%rdx, %r11
    160 	mov	16(up,n,8), %rax	C load the next source limb early
    161 	adc	$0, %r11		C r11 = high half plus carry
    162 	mul	v0
    163 	mov	%r9, 16(rp,n,8)
    164 L(L3):	add	%rax, %r11		C r11 is now a finished result limb
    165 	mov	%r10, 24(rp,n,8)
    166 	mov	%rdx, %r8
    167 	adc	$0, %r8			C r8 = high half plus carry
    168 	add	$4, n			C advance the index; sets SF for the js below
    169 	mov	-8(up,n,8), %rax	C next source limb; mov keeps the flags from the add
    170 	js	L(top)			C index still negative: another pass
    171 
        C Wind-down: rax holds the last source limb, r8 the pending high half and
        C r11 a finished limb that is not yet stored.
    172 L(end):	mul	v0
    173 	add	%rax, %r8
    174 	adc	$0, %rdx		C rdx = most significant limb of the whole product
    175 	mov	%r11, (rp)		C second-to-last result limb
    176 L(n1):	mov	%r8, 8(rp)		C last result limb (the only one when n = 1)
    177 	mov	%rdx, %rax		C return the high limb
    178 
    179 	pop	%rbx
    180 IFDOS(``pop	%rdi		'')
    181 IFDOS(``pop	%rsi		'')
    182 	ret
    183 EPILOGUE()
    184 ASM_END()
    185