Home | History | Annotate | Line # | Download | only in k7
      1 dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
      2 
      3 dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
      4 
      5 dnl  This file is part of the GNU MP Library.
      6 dnl
      7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8 dnl  it under the terms of either:
      9 dnl
     10 dnl    * the GNU Lesser General Public License as published by the Free
     11 dnl      Software Foundation; either version 3 of the License, or (at your
     12 dnl      option) any later version.
     13 dnl
     14 dnl  or
     15 dnl
     16 dnl    * the GNU General Public License as published by the Free Software
     17 dnl      Foundation; either version 2 of the License, or (at your option) any
     18 dnl      later version.
     19 dnl
     20 dnl  or both in parallel, as here.
     21 dnl
     22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25 dnl  for more details.
     26 dnl
     27 dnl  You should have received copies of the GNU General Public License and the
     28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29 dnl  see https://www.gnu.org/licenses/.
     30 
     31 include(`../config.m4')
     32 
     33 
     34 C			    cycles/limb
     35 C P5
     36 C P6 model 0-8,10-12
     37 C P6 model 9  (Banias)		 6.5
     38 C P6 model 13 (Dothan)
     39 C P4 model 0  (Willamette)
     40 C P4 model 1  (?)
     41 C P4 model 2  (Northwood)
     42 C P4 model 3  (Prescott)
     43 C P4 model 4  (Nocona)
     44 C AMD K6
     45 C AMD K7			 3.75
     46 C AMD K8
     47 
     48 C TODO
     49 C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
     50 C    but lose by 2x for n == 1.
     51 
     52 ifdef(`OPERATION_addmul_1',`
     53       define(`ADDSUB',        `add')
     54       define(`func',  `mpn_addmul_1')
     55 ')	C addmul_1 build: ADDSUB is the add instruction and func is mpn_addmul_1
     56 ifdef(`OPERATION_submul_1',`
     57       define(`ADDSUB',        `sub')
     58       define(`func',  `mpn_submul_1')
     59 ')	C submul_1 build: ADDSUB is the sub instruction and func is mpn_submul_1
     60 C  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it tells the build which entry points this file can provide -- confirm.
     61 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
     62 
     63 ASM_START()
     64 	TEXT
     65 	ALIGN(16)
     66 PROLOGUE(func)		C func is mpn_addmul_1 or mpn_submul_1: dst[i] = dst[i] ADDSUB src[i]*multiplier over n limbs, carry/borrow limb left in eax
     67 	add	$-16, %esp		C reserve 16 bytes for the four registers saved below
     68 	mov	%ebp, (%esp)
     69 	mov	%ebx, 4(%esp)
     70 	mov	%esi, 8(%esp)
     71 	mov	%edi, 12(%esp)
     72 C  The four stack arguments are read at 20, 24, 28 and 32(%esp) after the 16-byte adjustment.
     73 	mov	20(%esp), %edi		C edi = destination pointer (limbs are read-modify-written through ADDSUB)
     74 	mov	24(%esp), %esi		C esi = source pointer
     75 	mov	28(%esp), %eax		C eax = limb count n
     76 	mov	32(%esp), %ecx		C ecx = multiplier limb, unchanged for the whole routine
     77 	mov	%eax, %ebx
     78 	shr	$2, %eax
     79 	mov	%eax, 28(%esp)		C the n argument slot is reused as the loop counter, n/4
     80 	mov	(%esi), %eax		C eax = first source limb, consumed by the first mul on every path
     81 	and	$3, %ebx		C n mod 4 selects the feed-in path
     82 	jz	L(b0)			C n mod 4 = 0.  NOTE(review): n = 0 also lands here and is not handled; presumably callers guarantee n >= 1
     83 	cmp	$2, %ebx
     84 	jz	L(b2)
     85 	jg	L(b3)			C n mod 4 = 3; otherwise fall through with n mod 4 = 1
     86 C  n mod 4 = 1: bias both pointers by -4 so that 4(%edi) is the first destination limb.
     87 L(b1):	lea	-4(%esi), %esi
     88 	lea	-4(%edi), %edi
     89 	mul	%ecx			C edx:eax = first source limb * multiplier
     90 	mov	%eax, %ebx		C ebx = low word, pending for 4(%edi)
     91 	mov	%edx, %ebp		C ebp = high word
     92 	cmpl	$0, 28(%esp)
     93 	jz	L(cj1)			C n = 1: only the final add remains; edx still holds the high word
     94 	mov	8(%esi), %eax		C eax = second source limb
     95 	jmp	L(1)
     96 C  n mod 4 = 2: pointers are used unbiased, 0(%edi) is the first destination limb.
     97 L(b2):	mul	%ecx
     98 	mov	%eax, %ebp		C ebp = low word, pending for 0(%edi)
     99 	mov	4(%esi), %eax		C eax = second source limb
    100 	mov	%edx, %ebx		C ebx = high word
    101 	cmpl	$0, 28(%esp)
    102 	jne	L(2)
    103 	jmp	L(cj2)			C n = 2: go straight to the two-limb wind-down
    104 C  n mod 4 = 3: bias both pointers by -12 so that 12(%edi) is the first destination limb.
    105 L(b3):	lea	-12(%esi), %esi
    106 	lea	-12(%edi), %edi
    107 	mul	%ecx
    108 	mov	%eax, %ebx		C ebx = low word, pending for 12(%edi)
    109 	mov	%edx, %ebp		C ebp = high word
    110 	mov	16(%esi), %eax		C eax = second source limb
    111 	incl	28(%esp)		C the partial first pass entered at L(3) also runs the decl, so count it
    112 	jmp	L(3)
    113 C  n mod 4 = 0: bias both pointers by -8 so that 8(%edi) is the first destination limb.
    114 L(b0):	lea	-8(%esi), %esi
    115 	lea	-8(%edi), %edi
    116 	mul	%ecx
    117 	mov	%eax, %ebp		C ebp = low word, pending for 8(%edi)
    118 	mov	12(%esi), %eax		C eax = second source limb
    119 	mov	%edx, %ebx		C ebx = high word
    120 	jmp	L(0)
    121 C  Main loop, four limbs per pass.  ebx and ebp alternate: one holds the word to apply to the destination, the other collects the next word.
    122 	ALIGN(16)
    123 L(top):	lea	16(%edi), %edi		C advance the destination pointer by four limbs
    124 L(2):	mul	%ecx			C edx:eax = next product; eax was loaded with the source limb beforehand
    125 	ADDSUB	%ebp, 0(%edi)		C apply the pending word; CF = carry or borrow out
    126 	mov	$0, %ebp		C mov rather than xor: CF must survive for the adc instructions below
    127 	adc	%eax, %ebx		C ebx = previous high word + new low word + CF
    128 	mov	8(%esi), %eax		C fetch the source limb for the next mul
    129 	adc	%edx, %ebp		C ebp = new high word + CF
    130 L(1):	mul	%ecx			C same step with the roles of ebx and ebp swapped
    131 	ADDSUB	%ebx, 4(%edi)
    132 	mov	$0, %ebx
    133 	adc	%eax, %ebp
    134 	mov	12(%esi), %eax
    135 	adc	%edx, %ebx
    136 L(0):	mul	%ecx			C same step as L(2)
    137 	ADDSUB	%ebp, 8(%edi)
    138 	mov	$0, %ebp
    139 	adc	%eax, %ebx
    140 	adc	%edx, %ebp
    141 	mov	16(%esi), %eax
    142 L(3):	mul	%ecx			C same step as L(1), plus pointer and counter update
    143 	ADDSUB	%ebx, 12(%edi)
    144 	adc	%eax, %ebp
    145 	mov	20(%esi), %eax		C source limb for the next mul, at L(2) or at L(cj2)
    146 	lea	16(%esi), %esi		C advance the source pointer; lea leaves CF intact for the adc below
    147 	mov	$0, %ebx
    148 	adc	%edx, %ebx
    149 	decl	28(%esp)		C one fewer pass to go
    150 	jnz	L(top)
    151 C  Wind-down: ebp holds the word for the second-to-last destination limb, ebx the carried high word, eax the last source limb.
    152 L(end):	lea	16(%edi), %edi
    153 L(cj2):	mul	%ecx			C product of the final source limb
    154 	ADDSUB	%ebp, (%edi)		C second-to-last destination limb
    155 	adc	%eax, %ebx		C ebx = word for the last destination limb
    156 	adc	$0, %edx		C fold CF into the top word
    157 L(cj1):	ADDSUB	%ebx, 4(%edi)		C last destination limb
    158 	adc	$0, %edx		C edx = final carry or borrow limb
    159 	mov	%edx, %eax		C result is left in eax
    160 	mov	(%esp), %ebp		C restore the registers saved on entry
    161 	mov	4(%esp), %ebx
    162 	mov	8(%esp), %esi
    163 	mov	12(%esp), %edi
    164 	add	$16, %esp		C release the save area
    165 	ret
    166 EPILOGUE()
    167 ASM_END()
    168