Home | History | Annotate | Line # | Download | only in sse2
      1 dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
      2 
      3 dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
      4 
      5 dnl  Copyright 2011 Free Software Foundation, Inc.
      6 
      7 dnl  This file is part of the GNU MP Library.
      8 dnl
      9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10 dnl  it under the terms of either:
     11 dnl
     12 dnl    * the GNU Lesser General Public License as published by the Free
     13 dnl      Software Foundation; either version 3 of the License, or (at your
     14 dnl      option) any later version.
     15 dnl
     16 dnl  or
     17 dnl
     18 dnl    * the GNU General Public License as published by the Free Software
     19 dnl      Foundation; either version 2 of the License, or (at your option) any
     20 dnl      later version.
     21 dnl
     22 dnl  or both in parallel, as here.
     23 dnl
     24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27 dnl  for more details.
     28 dnl
     29 dnl  You should have received copies of the GNU General Public License and the
     30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31 dnl  see https://www.gnu.org/licenses/.
     32 
     33 include(`../config.m4')
     34 
     35 C			    cycles/limb
     36 C			    cycles/limb
     37 C P5				 -
     38 C P6 model 0-8,10-12		 -
     39 C P6 model 9  (Banias)
     40 C P6 model 13 (Dothan)
     41 C P4 model 0  (Willamette)
     42 C P4 model 1  (?)
     43 C P4 model 2  (Northwood)
     44 C P4 model 3  (Prescott)
     45 C P4 model 4  (Nocona)
     46 C Intel Atom			 8
     47 C AMD K6
     48 C AMD K7			 -
     49 C AMD K8
     50 C AMD K10
     51 
     52 define(`rp', `%edi')		C first stack argument: limbs that ADDSUB updates in memory
     53 define(`up', `%esi')		C second stack argument: limbs that are loaded and multiplied
     54 define(`n',  `%ecx')		C third stack argument: limb count, later the loop counter
     55 
        C Select the operation this build of the file implements.  ADDSUB is the
        C read-modify-write instruction applied to each destination limb, and
        C func_1 / func_1c are the names given to the two entry points below.
        C NOTE(review): nothing here handles the case where neither or both of the
        C OPERATION_* symbols are defined -- presumably the build defines exactly
        C one of them per compilation; confirm against the build machinery.
     56 ifdef(`OPERATION_addmul_1',`
     57 	define(ADDSUB,  add)
     58 	define(func_1,  mpn_addmul_1)
     59 	define(func_1c, mpn_addmul_1c)')
     60 ifdef(`OPERATION_submul_1',`
     61 	define(ADDSUB,  sub)
     62 	define(func_1,  mpn_submul_1)
     63 	define(func_1c, mpn_submul_1c)')
     64 
        C Lists all four names that func_1 / func_1c can expand to.
        C NOTE(review): presumably read by the configure machinery to learn which
        C functions this one file can provide -- confirm.
     65 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
     66 
     67 	TEXT
     68 	ALIGN(16)
     69 PROLOGUE(func_1)
        C func_1 expands to mpn_addmul_1 or mpn_submul_1 (see the OPERATION_* ifdefs).
        C For each of the n limbs it multiplies up[i] by a 32-bit multiplier and
        C applies ADDSUB of the product to rp[i] in memory, chaining carries.
        C
        C Arguments are read from the stack after the three pushes below:
        C   16(%esp)  rp   limbs updated in place
        C   20(%esp)  up   limbs read
        C   24(%esp)  n    limb count
        C   28(%esp)       multiplier, kept in %mm7 for the whole function
        C At ret, %eax holds the limb carried (add) or borrowed (sub) out of the top.
        C
        C Register roles:
        C   %mm0, %mm1  alternate as the 64-bit product of one limb
        C   %eax, %ebx  alternate as the low product half plus incoming carry
        C   %edx        high product half, i.e. the carry limb into the next limb
        C   n           after the shr: number of times dec n must reach the loop end
        C %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx, %mm0, %mm1,
        C %mm7 and the flags are overwritten.
        C
        C CF is live across almost the whole body: every ADDSUB leaves its carry or
        C borrow in CF and the following adc folds it into the next limb.  Only
        C flag-neutral instructions (movd, pmuludq, psrlq, lea, jumps, emms) and
        C dec, which leaves CF alone, sit in between.  Do not reorder instructions
        C or insert anything that writes the flags.
        C
        C NOTE(review): n = 0 is not handled.  It would take the L(fi0or2) path into
        C L(lo0) with a zero counter, which dec then wraps around.  Presumably
        C callers guarantee n >= 1 -- confirm against the mpn interface docs.
     70 	xor	%edx, %edx		C this entry starts with a zero carry limb
        C L(ent) is also the jump target of func_1c, which arrives here with its
        C own carry limb already in %edx.
     71 L(ent):	push	%edi
     72 	push	%esi
     73 	push	%ebx
     74 	mov	16(%esp), rp
     75 	mov	20(%esp), up
     76 	mov	24(%esp), n
     77 	movd	28(%esp), %mm7
        C Dispatch on n mod 4.  Bit 0 is tested here; bit 1 is shifted out into CF
        C by the shr, which also turns n into n / 4.
     78 	test	$1, n
     79 	jz	L(fi0or2)
     80 	movd	(up), %mm0
     81 	pmuludq	%mm7, %mm0		C product of limb 0, used by both odd cases
     82 	shr	$2, n
     83 	jnc	L(fi1)
     84 
        C n mod 4 = 3.  Both pointers are biased by -8 so that the store at 8(rp)
        C after L(lo3) lands on limb 0; 12(up) is then limb 1.  The count is bumped
        C because n / 4 is one short of the number of times dec n has to run.
     85 L(fi3):	lea	-8(up), up
     86 	lea	-8(rp), rp
     87 	movd	12(up), %mm1
     88 	movd	%mm0, %ebx
     89 	pmuludq	%mm7, %mm1
     90 	add	$1, n			C increment and clear carry
     91 	jmp	L(lo3)
     92 
        C n mod 4 = 1.  CF is clear (the jnc was taken) and ZF still reflects the
        C shr because movd leaves the flags alone, so jz catches n = 1 and goes
        C straight to the last-limb code.  Pointers need no bias for L(lo1).
     93 L(fi1):	movd	%mm0, %ebx
     94 	jz	L(wd1)
     95 	movd	4(up), %mm1
     96 	pmuludq	%mm7, %mm1
     97 	jmp	L(lo1)
     98 
        C n even.  Here limb 0 goes through %mm1 and limb 1 through %mm0.
        C Fall-through case is n mod 4 = 0: CF is clear (jc not taken, lea does not
        C write flags) and both pointers are biased by -4 so that the store at
        C 4(rp) after L(lo0) lands on limb 0.
     99 L(fi0or2):
    100 	movd	(up), %mm1
    101 	pmuludq	%mm7, %mm1
    102 	shr	$2, n
    103 	movd	4(up), %mm0
    104 	jc	L(fi2)
    105 	lea	-4(up), up
    106 	lea	-4(rp), rp
    107 	movd	%mm1, %eax
    108 	pmuludq	%mm7, %mm0
    109 	jmp	L(lo0)
    110 
        C n mod 4 = 2.  rp is biased by -12 because the first store reached from
        C L(lo2) is the one at 12(rp) in L(top) or L(end); up is advanced by 4 so
        C that 4(up) at L(top) is limb 2.  The limb 1 value already loaded into
        C %mm0 is multiplied at L(lo2).  The count is bumped for the same reason
        C as in L(fi3).
    111 L(fi2):	lea	4(up), up
    112 	add	$1, n			C increment and clear carry
    113 	movd	%mm1, %eax
    114 	lea	-12(rp), rp
    115 	jmp	L(lo2)
    116 
        C Main loop, four limbs per pass, entered at L(lo1), L(lo0), L(lo3) or
        C L(lo2).  Each quarter performs the same steps on one limb, with %mm0/%mm1
        C and %ebx/%eax swapping roles from one quarter to the next:
        C   psrlq $32     expose the high half of the current product
        C   adc %edx,lo   low half += carry limb + CF left by the previous ADDSUB
        C   movd          high half -> %edx, low half of the next product -> lo
        C   movd/pmuludq  load and multiply a limb further ahead
        C   adc $0,%edx   fold the carry out of the first adc into the carry limb
        C   ADDSUB        apply the finished low half to the destination limb
        C The first quarter is split around the loop edge: its adc $0 and ADDSUB
        C sit at L(top) and its first adc sits just before dec n.
    117 C	ALIGN(16)			C alignment seems irrelevant
    118 L(top):	movd	4(up), %mm1
    119 	adc	$0, %edx
    120 	ADDSUB	%eax, 12(rp)
    121 	movd	%mm0, %ebx
    122 	pmuludq	%mm7, %mm1
    123 	lea	16(rp), rp
    124 L(lo1):	psrlq	$32, %mm0
    125 	adc	%edx, %ebx
    126 	movd	%mm0, %edx
    127 	movd	%mm1, %eax
    128 	movd	8(up), %mm0
    129 	pmuludq	%mm7, %mm0
    130 	adc	$0, %edx
    131 	ADDSUB	%ebx, (rp)
    132 L(lo0):	psrlq	$32, %mm1
    133 	adc	%edx, %eax
    134 	movd	%mm1, %edx
    135 	movd	%mm0, %ebx
    136 	movd	12(up), %mm1
    137 	pmuludq	%mm7, %mm1
    138 	adc	$0, %edx
    139 	ADDSUB	%eax, 4(rp)
    140 L(lo3):	psrlq	$32, %mm0
    141 	adc	%edx, %ebx
    142 	movd	%mm0, %edx
    143 	movd	%mm1, %eax
    144 	lea	16(up), up
    145 	movd	(up), %mm0
    146 	adc	$0, %edx
    147 	ADDSUB	%ebx, 8(rp)
    148 L(lo2):	psrlq	$32, %mm1
    149 	adc	%edx, %eax
    150 	movd	%mm1, %edx
    151 	pmuludq	%mm7, %mm0
    152 	dec	n			C leaves CF from the adc above intact
    153 	jnz	L(top)
    154 
        C Wind-down.  L(end) finishes the limb whose low half is pending in %eax
        C (the same steps as at L(top), minus the look-ahead load).
    155 L(end):	adc	n, %edx			C n is zero here
    156 	ADDSUB	%eax, 12(rp)
    157 	movd	%mm0, %ebx
    158 	lea	16(rp), rp
        C Last limb.  n is zero on both ways in (loop exit above, or n = 1 via
        C L(fi1) where the shr left it zero), so each adc n adds only CF.
    159 L(wd1):	psrlq	$32, %mm0
    160 	adc	%edx, %ebx
    161 	movd	%mm0, %eax		C high half of the last product
    162 	adc	n, %eax
    163 	ADDSUB	%ebx, (rp)
    164 	emms				C leave MMX state; does not write flags
    165 	adc	n, %eax			C add the CF left by the final ADDSUB
    166 	pop	%ebx
    167 	pop	%esi
    168 	pop	%edi
    169 	ret
    170 EPILOGUE()
    171 PROLOGUE(func_1c)
        C Variant of func_1 that starts from a caller-supplied carry limb instead
        C of zero.  Nothing has been pushed yet at this point, so with the return
        C address at 0(%esp) the slot at 20(%esp) is the fifth argument, the one
        C after the four that the shared code reads.  The limb is placed in %edx,
        C where the shared code at L(ent) expects its initial carry, and control
        C joins func_1 just past the instruction that would have zeroed %edx.
    172 	mov	20(%esp), %edx		C carry
    173 	jmp	L(ent)
    174 EPILOGUE()
    175