Home | History | Annotate | Line # | Download | only in bd1
mul_1.asm revision 1.1.1.1.4.2
      1 dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
      2 
      3 dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012 Free Software
      4 dnl  Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of the GNU Lesser General Public License as published
     10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11 dnl  your option) any later version.
     12 
     13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16 dnl  License for more details.
     17 
     18 dnl  You should have received a copy of the GNU Lesser General Public License
     19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20 
     21 include(`../config.m4')
     22 
     23 C	     cycles/limb
     24 C AMD K8,K9
     25 C AMD K10
     26 C AMD bd1	 4
     27 C AMD bobcat
     28 C Intel P4
     29 C Intel core2
     30 C Intel NHM
     31 C Intel SBR
     32 C Intel atom
     33 C VIA nano
     34 
     35 C The loop of this code is the result of running a code generation and
     36 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
     37 
     38 C TODO
     39 C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
     40 
     41 define(`rp',      `%rdi')   C result pointer (stored through); rcx under DOS64
     42 define(`up',      `%rsi')   C source pointer (loaded through); rdx under DOS64
     43 define(`n_param', `%rdx')   C limb count as passed in; r8 under DOS64
     44 define(`v0',      `%rcx')   C multiplier limb, the operand of every mul; r9 under DOS64
     45 
        C n is the working copy of the limb count.  Under STD64 the entry code
        C moves the count out of rdx, because mul writes the high half of each
        C product there.
     46 define(`n',       `%rbx')
     47 
     48 ABI_SUPPORT(DOS64)
     49 ABI_SUPPORT(STD64)
     50 
        C DOS64 overrides.  IFDOS is defined outside this file; presumably it
        C expands its argument only when building for the DOS64 ABI.
        C   - up stays in rsi, because both prologues copy rdx into rsi.
        C   - rp, v0 and n use the incoming argument registers rcx, r9 and r8.
        C   - The bare words r9 and r8 are redefined, so a literal %r9 or %r8
        C     written in the code below names rdi or rbx instead.  That keeps
        C     those scratch values clear of v0 and n.
        C   - The doubled quotes presumably stop the %r9 and %r8 inside the v0
        C     and n expansions from being rewritten by those two word macros.
     51 IFDOS(`	define(`up', ``%rsi'')	') dnl
     52 IFDOS(`	define(`rp', ``%rcx'')	') dnl
     53 IFDOS(`	define(`v0', ``%r9'')	') dnl
     54 IFDOS(`	define(`r9', ``rdi'')	') dnl
     55 IFDOS(`	define(`n',  ``%r8'')	') dnl
     56 IFDOS(`	define(`r8', ``rbx'')	') dnl
     57 
     58 ASM_START()
     59 	TEXT
     60 	ALIGN(16)
        C mpn_mul_1c -- entry point that reuses the multiply loop of mpn_mul_1.
        C
        C It forms the first product up[0] * v0, adds one extra limb to the low
        C half, propagates the carry into the high half, and then jumps to
        C L(common) inside mpn_mul_1 with the product in rdx:rax and the limb
        C count in r11.
        C
        C In:   rp, up, n_param, v0 in the registers named by the m4 defines,
        C       plus the extra limb: r8 under STD64, 64(%rsp) after the three
        C       pushes under DOS64.  By the 1c suffix this is presumably a
        C       carry-in limb -- confirm against the callers.
        C Out:  produced by the shared code after L(common), which also pops
        C       the registers pushed here.
     61 PROLOGUE(mpn_mul_1c)
     62 IFDOS(``push	%rsi		'')	C rsi and rdi are saved on this ABI only
     63 IFDOS(``push	%rdi		'')
     64 IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx; the code addresses it via rsi
     65 
     66 	mov	(up), %rax		C read first u limb early
     67 	push	%rbx			C third push on DOS64; popped at the shared exit
     68 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     69 IFDOS(`	mov	n, %r11		')	C r11 carries the count to L(common) on both ABIs
     70 	mul	v0			C rdx:rax = first limb times v0
     71 
     72 IFSTD(` add	%r8, %rax	')	C add the extra limb to the low half
     73 IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
     74 	adc	$0, %rdx		C propagate the carry into the high half
     75 	jmp	L(common)		C continue in mpn_mul_1
     76 
     77 EPILOGUE()
     78 
     79 	ALIGN(16)
        C mpn_mul_1 -- multiply the n-limb operand at up by the single limb v0,
        C store the n low limbs of the product at rp, and return the most
        C significant limb in rax.
        C
        C In:   rp, up, n_param, v0 in the registers named by the m4 defines.
        C       The first source limb is read unconditionally, so n is assumed
        C       to be at least one -- TODO confirm against the callers.
        C Out:  rax = high limb left over after the last product.
        C Saved and restored: rbx, plus rsi and rdi under DOS64.
        C
        C Layout once a feed-in block has run:
        C   rp  points at the second-to-last result limb, so (rp) and 8(rp)
        C       are the final two stores.
        C   up  points just past the source operand.
        C   n   is a negative limb index that rises to exactly zero in steps
        C       of four; it indexes both rp and up.
        C   r8, r9, r10, r11 rotate as pending result limbs: each holds the
        C       high half of one product plus the low half of the next.
        C       Under DOS64 the written %r8 and %r9 are remapped to rbx and rdi.
        C
        C The loop handles four limbs per pass.  n mod 4 picks one of four
        C feed-in blocks, each of which enters the loop at a different label.
     80 PROLOGUE(mpn_mul_1)
     81 IFDOS(``push	%rsi		'')	C rsi and rdi are saved on this ABI only
     82 IFDOS(``push	%rdi		'')
     83 IFDOS(``mov	%rdx, %rsi	'')	C up arrives in rdx; the code addresses it via rsi
     84 
     85 	mov	(up), %rax		C read first u limb early
     86 	push	%rbx
     87 IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
     88 IFDOS(`	mov	n, %r11		')
     89 	mul	v0			C rdx:rax = first limb times v0
     90 
        C Shared continuation, also the jump target of mpn_mul_1c.
        C Expects rdx:rax = first product and r11 = limb count.
     91 L(common):
     92 IFSTD(`	mov	%r11, n		')	C working count into rbx
     93 
     94 	and	$3, R32(%r11)		C r11 = n mod 4, selects the feed-in block
     95 	lea	-16(rp,n,8), rp		C rp = rp + 8*(n-2); lea leaves the flags alone
     96 	jz	L(b0)			C flags still from the and: n mod 4 == 0
     97 	cmp	$2, R32(%r11)		C one compare, two branches
     98 	jb	L(b1)			C n mod 4 == 1
     99 	jz	L(b2)			C n mod 4 == 2; fall through when it is 3
    100 
        C n mod 4 == 3: compute the second product here and enter at L(L3).
    101 L(b3):	mov	%rax, %r10		C low half of first product, stored at L(L3)
    102 	mov	%rdx, %r11		C high half, completed at L(L3)
    103 	mov	8(up), %rax
    104 	mul	v0			C second product
    105 	lea	(up,n,8), up		C up = end of source
    106 	not	n			C n = -n - 1, so 24(rp,n,8) is the first result limb
    107 	jmp	L(L3)
    108 
        C n mod 4 == 0: enter at L(L0) with the second limb loaded.
    109 L(b0):	mov	%rax, %r9		C low half of first product, stored after L(L0)
    110 	mov	%rdx, %r10		C high half, completed at L(L0)
    111 	mov	8(up), %rax
    112 	lea	(up,n,8), up		C up = end of source
    113 	neg	n			C n = -n, so 16(rp,n,8) is the first result limb
    114 	jmp	L(L0)
    115 
        C n mod 4 == 1: store the first result limb here, then enter at L(L1).
    116 L(b1):	mov	%rax, %r8
    117 	cmp	$1, n
    118 	jz	L(n1)			C n == 1: only one store and the return remain
    119 	mov	%rdx, %r9		C high half, completed at L(L1)
    120 	lea	(up,n,8), up		C up = end of source
    121 	neg	n
    122 	mov	%r8, 16(rp,n,8)		C first result limb
    123 	inc	n			C n = 1 - n, so (up,n,8) is the second source limb
    124 	jmp	L(L1)
    125 
        C n mod 4 == 2: fall into the loop top, or go straight to the tail.
    126 L(b2):	mov	%rax, %r11		C low half of first product, stored at L(top) or L(end)
    127 	mov	%rdx, %r8		C high half, completed by the next product
    128 	mov	8(up), %rax
    129 	lea	(up,n,8), up		C up = end of source
    130 	neg	n
    131 	add	$2, n			C n = 2 - n
    132 	jns	L(end)			C n was 2: index is already zero, skip the loop
    133 
    134 	ALIGN(16)
        C Main loop: four mul per pass.  Each store is delayed until the low
        C half of the following product has been added in.
    135 L(top):	mul	v0
    136 	mov	%rdx, %r9
    137 	add	%rax, %r8
    138 	adc	$0, %r9
    139 	mov	%r8, 8(rp,n,8)
    140 	mov	%r11, (rp,n,8)		C limb completed at L(L3) in the previous pass
    141 L(L1):	mov	(up,n,8), %rax
    142 	mul	v0
    143 	add	%rax, %r9
    144 	mov	%rdx, %r10
    145 	mov	8(up,n,8), %rax
    146 	adc	$0, %r10		C carry from the add above; the movs keep the flags
    147 L(L0):	mul	v0
    148 	add	%rax, %r10
    149 	mov	%rdx, %r11
    150 	mov	16(up,n,8), %rax
    151 	adc	$0, %r11		C carry from the add above; the movs keep the flags
    152 	mul	v0
    153 	mov	%r9, 16(rp,n,8)
    154 L(L3):	add	%rax, %r11
    155 	mov	%r10, 24(rp,n,8)
    156 	mov	%rdx, %r8
    157 	adc	$0, %r8			C carry from the add at L(L3)
    158 	add	$4, n			C advance four limbs; sets the sign flag for js
    159 	mov	-8(up,n,8), %rax	C next source limb; mov keeps the flags from add
    160 	js	L(top)			C loop while the index is still negative
    161 
        C Tail: n is zero here, so rp is used without an index.
    162 L(end):	mul	v0			C last product
    163 	add	%rax, %r8
    164 	adc	$0, %rdx		C rdx = most significant limb
    165 	mov	%r11, (rp)		C second-to-last result limb
    166 L(n1):	mov	%r8, 8(rp)		C last result limb; the only one when n == 1
    167 	mov	%rdx, %rax		C return value
    168 
    169 	pop	%rbx			C restore in reverse order of the pushes
    170 IFDOS(``pop	%rdi		'')
    171 IFDOS(``pop	%rsi		'')
    172 	ret
    173 EPILOGUE()
    174 ASM_END()
    175